From ff57d797932582945955691d289ee41245280a71 Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Sat, 30 Jul 2016 23:02:06 -0500
Subject: [PATCH] TextureCache: Flush uploads if the staging buffer overflows
 in a frame.

---
 src/xenia/gpu/vulkan/texture_cache.cc | 269 ++++++++++++++++++++++----
 src/xenia/gpu/vulkan/texture_cache.h  |  12 +-
 2 files changed, 244 insertions(+), 37 deletions(-)

diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc
index 0e557e2cc..b95f3a1ce 100644
--- a/src/xenia/gpu/vulkan/texture_cache.cc
+++ b/src/xenia/gpu/vulkan/texture_cache.cc
@@ -25,7 +25,7 @@ namespace vulkan {
 using xe::ui::vulkan::CheckResult;
 
 constexpr uint32_t kMaxTextureSamplers = 32;
-constexpr VkDeviceSize kStagingBufferSize = 64 * 1024 * 1024;
+constexpr VkDeviceSize kStagingBufferSize = 32 * 1024 * 1024;
 
 struct TextureConfig {
   TextureFormat guest_format;
@@ -118,10 +118,10 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file,
   descriptor_pool_info.pNext = nullptr;
   descriptor_pool_info.flags =
       VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
-  descriptor_pool_info.maxSets = 8192;
+  descriptor_pool_info.maxSets = 32768;
   VkDescriptorPoolSize pool_sizes[1];
   pool_sizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-  pool_sizes[0].descriptorCount = 8192;
+  pool_sizes[0].descriptorCount = 32768;
   descriptor_pool_info.poolSizeCount = 1;
   descriptor_pool_info.pPoolSizes = pool_sizes;
   auto err = vkCreateDescriptorPool(*device_, &descriptor_pool_info, nullptr,
@@ -160,9 +160,15 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file,
   invalidated_textures_sets_[0].reserve(64);
   invalidated_textures_sets_[1].reserve(64);
   invalidated_textures_ = &invalidated_textures_sets_[0];
+
+  device_queue_ = device_->AcquireQueue();
 }
 
 TextureCache::~TextureCache() {
+  if (device_queue_) {
+    device_->ReleaseQueue(device_queue_);
+  }
+
   for (auto it = samplers_.begin(); it != samplers_.end(); ++it) {
     vkDestroySampler(*device_, it->second->sampler, nullptr);
     delete it->second;
@@ -204,26 +210,35 @@ TextureCache::Texture* TextureCache::AllocateTexture(
                     ? config.host_format
                     : VK_FORMAT_R8G8B8A8_UNORM;
 
+  image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
+  image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT |
+                     VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+                     VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+  // Check the device limits for the format before we create it.
   VkFormatProperties props;
+  vkGetPhysicalDeviceFormatProperties(*device_, format, &props);
   uint32_t required_flags = VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
                             VK_FORMAT_FEATURE_BLIT_DST_BIT |
                             VK_FORMAT_FEATURE_BLIT_SRC_BIT;
-  vkGetPhysicalDeviceFormatProperties(*device_, format, &props);
   if ((props.optimalTilingFeatures & required_flags) != required_flags) {
     // Texture needs conversion on upload to a native format.
     // assert_always();
   }
 
+  VkImageFormatProperties image_props;
+  vkGetPhysicalDeviceImageFormatProperties(
+      *device_, format, image_info.imageType, image_info.tiling,
+      image_info.usage, image_info.flags, &image_props);
+
+  // TODO(DrChat): Actually check the image properties.
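
Aside (not part of the patch): vkGetPhysicalDeviceImageFormatProperties also returns VK_ERROR_FORMAT_NOT_SUPPORTED when the format/tiling/usage combination can't be created at all, and the limits it fills in are what the TODO above would compare against. A minimal sketch of such a check, using the image_props filled in above and treating a failed allocation as the error path (the early return is a hypothetical policy, not what this change does):

```cpp
// Sketch only. Guest width/height/depth are stored minus one (hence the +1,
// matching the extent assignment below); reject sizes the device can't make.
if (texture_info.width + 1 > image_props.maxExtent.width ||
    texture_info.height + 1 > image_props.maxExtent.height ||
    texture_info.depth + 1 > image_props.maxExtent.depth) {
  // Unsupported as requested; would need conversion or a different format.
  return nullptr;
}
```
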
+
   image_info.format = format;
   image_info.extent = {texture_info.width + 1, texture_info.height + 1,
                        texture_info.depth + 1};
   image_info.mipLevels = 1;
   image_info.arrayLayers = 1;
   image_info.samples = VK_SAMPLE_COUNT_1_BIT;
-  image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
-  image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT |
-                     VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
-                     VK_IMAGE_USAGE_TRANSFER_DST_BIT;
   image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
   image_info.queueFamilyIndexCount = 0;
   image_info.pQueueFamilyIndices = nullptr;
@@ -257,6 +272,7 @@ TextureCache::Texture* TextureCache::AllocateTexture(
   texture->image_memory = memory;
   texture->memory_offset = 0;
   texture->memory_size = mem_requirements.size;
+  texture->access_watch_handle = 0;
   texture->texture_info = texture_info;
   return texture;
 }
@@ -380,7 +396,7 @@
     }
   }
 
-  if (!command_buffer || texture_info.dimension != Dimension::k2D) {
+  if (!command_buffer) {
     // Texture not found and no command buffer was passed, preventing us from
     // uploading a new one.
     return nullptr;
@@ -402,6 +418,8 @@
     } break;
 
     case Dimension::kCube: {
+      uploaded = UploadTextureCube(command_buffer, completion_fence, texture,
+                                   texture_info);
     } break;
 
     default:
@@ -515,6 +533,10 @@ TextureCache::TextureView* TextureCache::DemandView(Texture* texture,
         VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
   }
 
+  if (texture->texture_info.dimension == Dimension::kCube) {
+    view_info.subresourceRange.layerCount = 6;
+  }
+
   VkImageView view;
   auto status = vkCreateImageView(*device_, &view_info, nullptr, &view);
   CheckResult(status, "vkCreateImageView");
@@ -748,44 +770,55 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
   }
 }
 
-bool TextureCache::UploadTexture2D(
+void TextureCache::FlushPendingCommands(
     VkCommandBuffer command_buffer,
-    std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest,
-    TextureInfo src) {
-#if FINE_GRAINED_DRAW_SCOPES
-  SCOPE_profile_cpu_f("gpu");
-#endif  // FINE_GRAINED_DRAW_SCOPES
+    std::shared_ptr<ui::vulkan::Fence> completion_fence) {
+  auto status = vkEndCommandBuffer(command_buffer);
+  CheckResult(status, "vkEndCommandBuffer");
 
-  assert_true(src.dimension == Dimension::k2D);
+  VkSubmitInfo submit_info;
+  std::memset(&submit_info, 0, sizeof(submit_info));
+  submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info.commandBufferCount = 1;
+  submit_info.pCommandBuffers = &command_buffer;
 
-  size_t unpack_length = src.output_length;
-  if (!staging_buffer_.CanAcquire(unpack_length)) {
-    // Need to have unique memory for every upload for at least one frame. If we
-    // run out of memory, we need to flush all queued upload commands to the
-    // GPU.
-    // TODO: Actually flush commands.
-    assert_always();
+  if (device_queue_) {
+    auto status =
+        vkQueueSubmit(device_queue_, 1, &submit_info, *completion_fence);
+    CheckResult(status, "vkQueueSubmit");
+  } else {
+    std::lock_guard<std::mutex> lock(device_->primary_queue_mutex());
+
+    auto status = vkQueueSubmit(device_->primary_queue(), 1, &submit_info,
+                                *completion_fence);
+    CheckResult(status, "vkQueueSubmit");
   }
 
-  // Grab some temporary memory for staging.
-  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
-  assert_not_null(alloc);
+  VkFence fences[] = {*completion_fence};
+  vkWaitForFences(*device_, 1, fences, VK_TRUE, -1);
+  staging_buffer_.Scavenge();
+  vkResetFences(*device_, 1, fences);
 
-  // Upload texture into GPU memory.
-  // TODO: If the GPU supports it, we can submit a compute batch to convert the
-  // texture and copy it to its destination. Otherwise, fallback to conversion
-  // on the CPU.
+  // Reset the command buffer and put it back into the recording state.
+  vkResetCommandBuffer(command_buffer, 0);
+  VkCommandBufferBeginInfo begin_info;
+  std::memset(&begin_info, 0, sizeof(begin_info));
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+  vkBeginCommandBuffer(command_buffer, &begin_info);
+}
+
+void TextureCache::ConvertTexture2D(uint8_t* dest, const TextureInfo& src) {
   void* host_address = memory_->TranslatePhysical(src.guest_address);
   if (!src.is_tiled) {
     if (src.size_2d.input_pitch == src.size_2d.output_pitch) {
       // Fast path copy entire image.
-      TextureSwap(src.endianness, alloc->host_ptr, host_address, unpack_length);
+      TextureSwap(src.endianness, dest, host_address, src.output_length);
     } else {
       // Slow path copy row-by-row because strides differ.
       // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
       // this exact thing under the covers, so we just always do it here.
       const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-      uint8_t* dest = reinterpret_cast<uint8_t*>(alloc->host_ptr);
       uint32_t pitch =
           std::min(src.size_2d.input_pitch, src.size_2d.output_pitch);
       for (uint32_t y = 0;
@@ -802,7 +835,6 @@
 
     // TODO(benvanik): optimize this inner loop (or work by tiles).
     const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-    uint8_t* dest = reinterpret_cast<uint8_t*>(alloc->host_ptr);
     uint32_t bytes_per_block = src.format_info->block_width *
                                src.format_info->block_height *
                                src.format_info->bits_per_pixel / 8;
@@ -830,7 +862,103 @@
       }
     }
   }
+}
 
+void TextureCache::ConvertTextureCube(uint8_t* dest, const TextureInfo& src) {
+  void* host_address = memory_->TranslatePhysical(src.guest_address);
+  if (!src.is_tiled) {
+    if (src.size_cube.input_pitch == src.size_cube.output_pitch) {
+      // Fast path copy entire image.
+      TextureSwap(src.endianness, dest, host_address, src.output_length);
+    } else {
+      // Slow path copy row-by-row because strides differ.
+      // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
+      // this exact thing under the covers, so we just always do it here.
+      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+      for (int face = 0; face < 6; ++face) {
+        uint32_t pitch =
+            std::min(src.size_cube.input_pitch, src.size_cube.output_pitch);
+        for (uint32_t y = 0; y < src.size_cube.block_height; y++) {
+          TextureSwap(src.endianness, dest, src_mem, pitch);
+          src_mem += src.size_cube.input_pitch;
+          dest += src.size_cube.output_pitch;
+        }
+      }
+    }
+  } else {
+    // TODO(benvanik): optimize this inner loop (or work by tiles).
+    const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+    uint32_t bytes_per_block = src.format_info->block_width *
+                               src.format_info->block_height *
+                               src.format_info->bits_per_pixel / 8;
+    // Tiled textures can be packed; get the offset into the packed texture.
+    uint32_t offset_x;
+    uint32_t offset_y;
+    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
+    auto bpp = (bytes_per_block >> 2) +
+               ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
+    for (int face = 0; face < 6; ++face) {
+      for (uint32_t y = 0, output_base_offset = 0;
+           y < src.size_cube.block_height;
+           y++, output_base_offset += src.size_cube.output_pitch) {
+        auto input_base_offset = TextureInfo::TiledOffset2DOuter(
+            offset_y + y,
+            (src.size_cube.input_width / src.format_info->block_width), bpp);
+        for (uint32_t x = 0, output_offset = output_base_offset;
+             x < src.size_cube.block_width;
+             x++, output_offset += bytes_per_block) {
+          auto input_offset =
+              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
+                                              input_base_offset) >>
+              bpp;
+          TextureSwap(src.endianness, dest + output_offset,
+                      src_mem + input_offset * bytes_per_block,
+                      bytes_per_block);
+        }
+      }
+      src_mem += src.size_cube.input_face_length;
+      dest += src.size_cube.output_face_length;
+    }
+  }
+}
+
+bool TextureCache::UploadTexture2D(
+    VkCommandBuffer command_buffer,
+    std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest,
+    const TextureInfo& src) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  assert_true(src.dimension == Dimension::k2D);
+
+  size_t unpack_length = src.output_length;
+  if (!staging_buffer_.CanAcquire(unpack_length)) {
+    // Need to have unique memory for every upload for at least one frame. If we
+    // run out of memory, we need to flush all queued upload commands to the
+    // GPU.
+    FlushPendingCommands(command_buffer, completion_fence);
+
+    // Uploads have been flushed. Continue.
+    if (!staging_buffer_.CanAcquire(unpack_length)) {
+      // The staging buffer isn't big enough to hold this texture.
+      XELOGE(
+          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
+          unpack_length);
+      assert_always();
+      return false;
+    }
+  }
+
+  // Grab some temporary memory for staging.
+  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
+  assert_not_null(alloc);
+
+  // Upload texture into GPU memory.
+  // TODO: If the GPU supports it, we can submit a compute batch to convert the
+  // texture and copy it to its destination. Otherwise, fallback to conversion
+  // on the CPU.
+  ConvertTexture2D(reinterpret_cast<uint8_t*>(alloc->host_ptr), src);
   staging_buffer_.Flush(alloc);
 
   // Transition the texture into a transfer destination layout.
@@ -879,8 +1007,79 @@
 bool TextureCache::UploadTextureCube(
     VkCommandBuffer command_buffer,
     std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest,
-    TextureInfo src) {
-  return false;
+    const TextureInfo& src) {
+  assert_true(src.dimension == Dimension::kCube);
+
+  size_t unpack_length = src.output_length;
+  if (!staging_buffer_.CanAcquire(unpack_length)) {
+    // Need to have unique memory for every upload for at least one frame. If we
+    // run out of memory, we need to flush all queued upload commands to the
+    // GPU.
+    FlushPendingCommands(command_buffer, completion_fence);
+
+    // Uploads have been flushed. Continue.
+    if (!staging_buffer_.CanAcquire(unpack_length)) {
+      // The staging buffer isn't big enough to hold this texture.
+      XELOGE(
+          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
+          unpack_length);
+      assert_always();
+      return false;
+    }
+  }
+
+  // Grab some temporary memory for staging.
+  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
+  assert_not_null(alloc);
+
+  // Upload texture into GPU memory.
+  // TODO: If the GPU supports it, we can submit a compute batch to convert the
+  // texture and copy it to its destination. Otherwise, fallback to conversion
+  // on the CPU.
+  ConvertTextureCube(reinterpret_cast<uint8_t*>(alloc->host_ptr), src);
+  staging_buffer_.Flush(alloc);
+
+  // Transition the texture into a transfer destination layout.
+  VkImageMemoryBarrier barrier;
+  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  barrier.pNext = nullptr;
+  barrier.srcAccessMask = 0;
+  barrier.dstAccessMask =
+      VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT;
+  barrier.oldLayout = dest->image_layout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.image = dest->image;
+  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  // Now move the converted texture into the destination.
+  VkBufferImageCopy copy_region;
+  copy_region.bufferOffset = alloc->offset;
+  copy_region.bufferRowLength = src.size_cube.output_width;
+  copy_region.bufferImageHeight = src.size_cube.output_height;
+  copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+  copy_region.imageOffset = {0, 0, 0};
+  copy_region.imageExtent = {src.size_cube.output_width,
+                             src.size_cube.output_height, 6};
+  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
+                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
+                         &copy_region);
+
+  // Now transition the texture into a shader readonly source.
+  barrier.srcAccessMask = barrier.dstAccessMask;
+  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  barrier.oldLayout = barrier.newLayout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  dest->image_layout = barrier.newLayout;
+  return true;
 }
 
 VkDescriptorSet TextureCache::PrepareTextureSet(
diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h
index aed25ad86..732dd493f 100644
--- a/src/xenia/gpu/vulkan/texture_cache.h
+++ b/src/xenia/gpu/vulkan/texture_cache.h
@@ -146,16 +146,23 @@ class TextureCache {
   TextureView* DemandView(Texture* texture, uint16_t swizzle);
   Sampler* Demand(const SamplerInfo& sampler_info);
 
+  void FlushPendingCommands(
+      VkCommandBuffer command_buffer,
+      std::shared_ptr<ui::vulkan::Fence> completion_fence);
+
+  void ConvertTexture2D(uint8_t* dest, const TextureInfo& src);
+  void ConvertTextureCube(uint8_t* dest, const TextureInfo& src);
+
   // Queues commands to upload a texture from system memory, applying any
   // conversions necessary. This may flush the command buffer to the GPU if we
   // run out of staging memory.
   bool UploadTexture2D(VkCommandBuffer command_buffer,
                        std::shared_ptr<ui::vulkan::Fence> completion_fence,
-                       Texture* dest, TextureInfo src);
+                       Texture* dest, const TextureInfo& src);
 
   bool UploadTextureCube(VkCommandBuffer command_buffer,
                          std::shared_ptr<ui::vulkan::Fence> completion_fence,
-                         Texture* dest, TextureInfo src);
+                         Texture* dest, const TextureInfo& src);
 
   bool SetupTextureBindings(
       VkCommandBuffer command_buffer,
@@ -172,6 +179,7 @@
   RegisterFile* register_file_ = nullptr;
   TraceWriter* trace_writer_ = nullptr;
   ui::vulkan::VulkanDevice* device_ = nullptr;
+  VkQueue device_queue_ = nullptr;
 
   VkDescriptorPool descriptor_pool_ = nullptr;
   VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr;
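
Aside (review note, not part of the patch): UploadTexture2D and UploadTextureCube now duplicate the same flush-and-retry sequence around the staging buffer. If a follow-up wanted to factor that out, it could look roughly like the hypothetical helper below, which returns null only when the texture is larger than the entire staging buffer (the case the XELOGE above reports):

```cpp
// Hypothetical TextureCache helper, sketched from the logic in this patch;
// not code from the tree. Returns a staging allocation, flushing any pending
// uploads once if the circular staging buffer is currently full.
auto TextureCache::AcquireStaging(
    VkCommandBuffer command_buffer,
    std::shared_ptr<ui::vulkan::Fence> completion_fence, size_t length) {
  if (!staging_buffer_.CanAcquire(length)) {
    // Submit queued uploads, wait on the fence, and scavenge staging memory.
    FlushPendingCommands(command_buffer, completion_fence);
  }
  // Null only if the request exceeds kStagingBufferSize outright.
  return staging_buffer_.CanAcquire(length)
             ? staging_buffer_.Acquire(length, completion_fence)
             : nullptr;
}
```

Either way, halving kStagingBufferSize to 32 MiB is a reasonable companion change once overflow flushes instead of asserting, since a single frame's uploads no longer have to fit in the buffer all at once.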