From ff57d797932582945955691d289ee41245280a71 Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Sat, 30 Jul 2016 23:02:06 -0500
Subject: [PATCH] TextureCache: Flush uploads if the staging buffer overflows
 in a frame.

---
 src/xenia/gpu/vulkan/texture_cache.cc | 269 ++++++++++++++++++++++----
 src/xenia/gpu/vulkan/texture_cache.h  |  12 +-
 2 files changed, 244 insertions(+), 37 deletions(-)

diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc
index 0e557e2cc..b95f3a1ce 100644
--- a/src/xenia/gpu/vulkan/texture_cache.cc
+++ b/src/xenia/gpu/vulkan/texture_cache.cc
@@ -25,7 +25,7 @@ namespace vulkan {
 using xe::ui::vulkan::CheckResult;
 
 constexpr uint32_t kMaxTextureSamplers = 32;
-constexpr VkDeviceSize kStagingBufferSize = 64 * 1024 * 1024;
+constexpr VkDeviceSize kStagingBufferSize = 32 * 1024 * 1024;
 
 struct TextureConfig {
   TextureFormat guest_format;
@@ -118,10 +118,10 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file,
   descriptor_pool_info.pNext = nullptr;
   descriptor_pool_info.flags =
       VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
-  descriptor_pool_info.maxSets = 8192;
+  descriptor_pool_info.maxSets = 32768;
   VkDescriptorPoolSize pool_sizes[1];
   pool_sizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
-  pool_sizes[0].descriptorCount = 8192;
+  pool_sizes[0].descriptorCount = 32768;
   descriptor_pool_info.poolSizeCount = 1;
   descriptor_pool_info.pPoolSizes = pool_sizes;
   auto err = vkCreateDescriptorPool(*device_, &descriptor_pool_info, nullptr,
@@ -160,9 +160,15 @@ TextureCache::TextureCache(Memory* memory, RegisterFile* register_file,
   invalidated_textures_sets_[0].reserve(64);
   invalidated_textures_sets_[1].reserve(64);
   invalidated_textures_ = &invalidated_textures_sets_[0];
+
+  device_queue_ = device_->AcquireQueue();
 }
 
 TextureCache::~TextureCache() {
+  if (device_queue_) {
+    device_->ReleaseQueue(device_queue_);
+  }
+
   for (auto it = samplers_.begin(); it != samplers_.end(); ++it) {
     vkDestroySampler(*device_, it->second->sampler, nullptr);
     delete it->second;
@@ -204,26 +210,35 @@ TextureCache::Texture* TextureCache::AllocateTexture(
                     ? config.host_format
                     : VK_FORMAT_R8G8B8A8_UNORM;
 
+  image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
+  image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT |
+                     VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+                     VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+  // Check the device limits for the format before we create it.
   VkFormatProperties props;
+  vkGetPhysicalDeviceFormatProperties(*device_, format, &props);
   uint32_t required_flags = VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
                             VK_FORMAT_FEATURE_BLIT_DST_BIT |
                             VK_FORMAT_FEATURE_BLIT_SRC_BIT;
-  vkGetPhysicalDeviceFormatProperties(*device_, format, &props);
   if ((props.optimalTilingFeatures & required_flags) != required_flags) {
     // Texture needs conversion on upload to a native format.
     // assert_always();
   }
 
+  VkImageFormatProperties image_props;
+  vkGetPhysicalDeviceImageFormatProperties(
+      *device_, format, image_info.imageType, image_info.tiling,
+      image_info.usage, image_info.flags, &image_props);
+
+  // TODO(DrChat): Actually check the image properties.
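
Aside (not part of the patch): vkGetPhysicalDeviceImageFormatProperties also returns VK_ERROR_FORMAT_NOT_SUPPORTED when the format/tiling/usage combination can't be created at all, and the limits it fills in are what the TODO above would compare against. A minimal sketch of such a check, using the image_props filled in above and treating a failed allocation as the error path (the early return is a hypothetical policy, not what this change does):

```cpp
// Sketch only. Guest width/height/depth are stored minus one (hence the +1,
// matching the extent assignment below); reject sizes the device can't make.
if (texture_info.width + 1 > image_props.maxExtent.width ||
    texture_info.height + 1 > image_props.maxExtent.height ||
    texture_info.depth + 1 > image_props.maxExtent.depth) {
  // Unsupported as requested; would need conversion or a different format.
  return nullptr;
}
```
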
+
   image_info.format = format;
   image_info.extent = {texture_info.width + 1, texture_info.height + 1,
                        texture_info.depth + 1};
   image_info.mipLevels = 1;
   image_info.arrayLayers = 1;
   image_info.samples = VK_SAMPLE_COUNT_1_BIT;
-  image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
-  image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT |
-                     VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
-                     VK_IMAGE_USAGE_TRANSFER_DST_BIT;
   image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
   image_info.queueFamilyIndexCount = 0;
   image_info.pQueueFamilyIndices = nullptr;
@@ -257,6 +272,7 @@ TextureCache::Texture* TextureCache::AllocateTexture(
   texture->image_memory = memory;
   texture->memory_offset = 0;
   texture->memory_size = mem_requirements.size;
+  texture->access_watch_handle = 0;
   texture->texture_info = texture_info;
   return texture;
 }
@@ -380,7 +396,7 @@
     }
   }
 
-  if (!command_buffer || texture_info.dimension != Dimension::k2D) {
+  if (!command_buffer) {
     // Texture not found and no command buffer was passed, preventing us from
     // uploading a new one.
     return nullptr;
@@ -402,6 +418,8 @@
     } break;
 
     case Dimension::kCube: {
+      uploaded = UploadTextureCube(command_buffer, completion_fence, texture,
+                                   texture_info);
     } break;
 
     default:
@@ -515,6 +533,10 @@ TextureCache::TextureView* TextureCache::DemandView(Texture* texture,
         VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
   }
 
+  if (texture->texture_info.dimension == Dimension::kCube) {
+    view_info.subresourceRange.layerCount = 6;
+  }
+
   VkImageView view;
   auto status = vkCreateImageView(*device_, &view_info, nullptr, &view);
   CheckResult(status, "vkCreateImageView");
@@ -748,44 +770,55 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
   }
 }
 
-bool TextureCache::UploadTexture2D(
+void TextureCache::FlushPendingCommands(
     VkCommandBuffer command_buffer,
-    std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest,
-    TextureInfo src) {
-#if FINE_GRAINED_DRAW_SCOPES
-  SCOPE_profile_cpu_f("gpu");
-#endif  // FINE_GRAINED_DRAW_SCOPES
+    std::shared_ptr<ui::vulkan::Fence> completion_fence) {
+  auto status = vkEndCommandBuffer(command_buffer);
+  CheckResult(status, "vkEndCommandBuffer");
 
-  assert_true(src.dimension == Dimension::k2D);
+  VkSubmitInfo submit_info;
+  std::memset(&submit_info, 0, sizeof(submit_info));
+  submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info.commandBufferCount = 1;
+  submit_info.pCommandBuffers = &command_buffer;
 
-  size_t unpack_length = src.output_length;
-  if (!staging_buffer_.CanAcquire(unpack_length)) {
-    // Need to have unique memory for every upload for at least one frame. If we
-    // run out of memory, we need to flush all queued upload commands to the
-    // GPU.
-    // TODO: Actually flush commands.
-    assert_always();
+  if (device_queue_) {
+    auto status =
+        vkQueueSubmit(device_queue_, 1, &submit_info, *completion_fence);
+    CheckResult(status, "vkQueueSubmit");
+  } else {
+    std::lock_guard<std::mutex> lock(device_->primary_queue_mutex());
+
+    auto status = vkQueueSubmit(device_->primary_queue(), 1, &submit_info,
+                                *completion_fence);
+    CheckResult(status, "vkQueueSubmit");
   }
 
-  // Grab some temporary memory for staging.
-  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
-  assert_not_null(alloc);
+  VkFence fences[] = {*completion_fence};
+  vkWaitForFences(*device_, 1, fences, VK_TRUE, -1);
+  staging_buffer_.Scavenge();
+  vkResetFences(*device_, 1, fences);
 
-  // Upload texture into GPU memory.
-  // TODO: If the GPU supports it, we can submit a compute batch to convert the
-  // texture and copy it to its destination. Otherwise, fallback to conversion
-  // on the CPU.
+  // Reset the command buffer and put it back into the recording state.
+  vkResetCommandBuffer(command_buffer, 0);
+  VkCommandBufferBeginInfo begin_info;
+  std::memset(&begin_info, 0, sizeof(begin_info));
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+  vkBeginCommandBuffer(command_buffer, &begin_info);
+}
+
+void TextureCache::ConvertTexture2D(uint8_t* dest, const TextureInfo& src) {
   void* host_address = memory_->TranslatePhysical(src.guest_address);
   if (!src.is_tiled) {
     if (src.size_2d.input_pitch == src.size_2d.output_pitch) {
       // Fast path copy entire image.
-      TextureSwap(src.endianness, alloc->host_ptr, host_address, unpack_length);
+      TextureSwap(src.endianness, dest, host_address, src.output_length);
     } else {
       // Slow path copy row-by-row because strides differ.
       // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
       // this exact thing under the covers, so we just always do it here.
       const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-      uint8_t* dest = reinterpret_cast<uint8_t*>(alloc->host_ptr);
       uint32_t pitch =
           std::min(src.size_2d.input_pitch, src.size_2d.output_pitch);
       for (uint32_t y = 0;
@@ -802,7 +835,6 @@
 
     // TODO(benvanik): optimize this inner loop (or work by tiles).
     const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-    uint8_t* dest = reinterpret_cast<uint8_t*>(alloc->host_ptr);
     uint32_t bytes_per_block = src.format_info->block_width *
                                src.format_info->block_height *
                                src.format_info->bits_per_pixel / 8;
@@ -830,7 +862,103 @@
       }
     }
   }
+}
 
+void TextureCache::ConvertTextureCube(uint8_t* dest, const TextureInfo& src) {
+  void* host_address = memory_->TranslatePhysical(src.guest_address);
+  if (!src.is_tiled) {
+    if (src.size_cube.input_pitch == src.size_cube.output_pitch) {
+      // Fast path copy entire image.
+      TextureSwap(src.endianness, dest, host_address, src.output_length);
+    } else {
+      // Slow path copy row-by-row because strides differ.
+      // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
+      // this exact thing under the covers, so we just always do it here.
+      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+      for (int face = 0; face < 6; ++face) {
+        uint32_t pitch =
+            std::min(src.size_cube.input_pitch, src.size_cube.output_pitch);
+        for (uint32_t y = 0; y < src.size_cube.block_height; y++) {
+          TextureSwap(src.endianness, dest, src_mem, pitch);
+          src_mem += src.size_cube.input_pitch;
+          dest += src.size_cube.output_pitch;
+        }
+      }
+    }
+  } else {
+    // TODO(benvanik): optimize this inner loop (or work by tiles).
+    const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+    uint32_t bytes_per_block = src.format_info->block_width *
+                               src.format_info->block_height *
+                               src.format_info->bits_per_pixel / 8;
+    // Tiled textures can be packed; get the offset into the packed texture.
+    uint32_t offset_x;
+    uint32_t offset_y;
+    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
+    auto bpp = (bytes_per_block >> 2) +
+               ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
+    for (int face = 0; face < 6; ++face) {
+      for (uint32_t y = 0, output_base_offset = 0;
+           y < src.size_cube.block_height;
+           y++, output_base_offset += src.size_cube.output_pitch) {
+        auto input_base_offset = TextureInfo::TiledOffset2DOuter(
+            offset_y + y,
+            (src.size_cube.input_width / src.format_info->block_width), bpp);
+        for (uint32_t x = 0, output_offset = output_base_offset;
+             x < src.size_cube.block_width;
+             x++, output_offset += bytes_per_block) {
+          auto input_offset =
+              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
+                                              input_base_offset) >>
+              bpp;
+          TextureSwap(src.endianness, dest + output_offset,
+                      src_mem + input_offset * bytes_per_block,
+                      bytes_per_block);
+        }
+      }
+      src_mem += src.size_cube.input_face_length;
+      dest += src.size_cube.output_face_length;
+    }
+  }
+}
+
+bool TextureCache::UploadTexture2D(
+    VkCommandBuffer command_buffer,
+    std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest,
+    const TextureInfo& src) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  assert_true(src.dimension == Dimension::k2D);
+
+  size_t unpack_length = src.output_length;
+  if (!staging_buffer_.CanAcquire(unpack_length)) {
+    // Need to have unique memory for every upload for at least one frame. If we
+    // run out of memory, we need to flush all queued upload commands to the
+    // GPU.
+    FlushPendingCommands(command_buffer, completion_fence);
+
+    // Uploads have been flushed. Continue.
+    if (!staging_buffer_.CanAcquire(unpack_length)) {
+      // The staging buffer isn't big enough to hold this texture.
+      XELOGE(
+          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
+          unpack_length);
+      assert_always();
+      return false;
+    }
+  }
+
+  // Grab some temporary memory for staging.
+  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
+  assert_not_null(alloc);
+
+  // Upload texture into GPU memory.
+  // TODO: If the GPU supports it, we can submit a compute batch to convert the
+  // texture and copy it to its destination. Otherwise, fallback to conversion
+  // on the CPU.
+  ConvertTexture2D(reinterpret_cast<uint8_t*>(alloc->host_ptr), src);
   staging_buffer_.Flush(alloc);
 
   // Transition the texture into a transfer destination layout.
@@ -879,8 +1007,79 @@
 bool TextureCache::UploadTextureCube(
     VkCommandBuffer command_buffer,
     std::shared_ptr<ui::vulkan::Fence> completion_fence, Texture* dest,
-    TextureInfo src) {
-  return false;
+    const TextureInfo& src) {
+  assert_true(src.dimension == Dimension::kCube);
+
+  size_t unpack_length = src.output_length;
+  if (!staging_buffer_.CanAcquire(unpack_length)) {
+    // Need to have unique memory for every upload for at least one frame. If we
+    // run out of memory, we need to flush all queued upload commands to the
+    // GPU.
+    FlushPendingCommands(command_buffer, completion_fence);
+
+    // Uploads have been flushed. Continue.
+    if (!staging_buffer_.CanAcquire(unpack_length)) {
+      // The staging buffer isn't big enough to hold this texture.
+      XELOGE(
+          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
+          unpack_length);
+      assert_always();
+      return false;
+    }
+  }
+
+  // Grab some temporary memory for staging.
+  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
+  assert_not_null(alloc);
+
+  // Upload texture into GPU memory.
+  // TODO: If the GPU supports it, we can submit a compute batch to convert the
+  // texture and copy it to its destination. Otherwise, fallback to conversion
+  // on the CPU.
+  ConvertTextureCube(reinterpret_cast<uint8_t*>(alloc->host_ptr), src);
+  staging_buffer_.Flush(alloc);
+
+  // Transition the texture into a transfer destination layout.
+  VkImageMemoryBarrier barrier;
+  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  barrier.pNext = nullptr;
+  barrier.srcAccessMask = 0;
+  barrier.dstAccessMask =
+      VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT;
+  barrier.oldLayout = dest->image_layout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.image = dest->image;
+  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  // Now move the converted texture into the destination.
+  VkBufferImageCopy copy_region;
+  copy_region.bufferOffset = alloc->offset;
+  copy_region.bufferRowLength = src.size_cube.output_width;
+  copy_region.bufferImageHeight = src.size_cube.output_height;
+  copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+  copy_region.imageOffset = {0, 0, 0};
+  copy_region.imageExtent = {src.size_cube.output_width,
+                             src.size_cube.output_height, 6};
+  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
+                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
+                         &copy_region);
+
+  // Now transition the texture into a shader readonly source.
+  barrier.srcAccessMask = barrier.dstAccessMask;
+  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  barrier.oldLayout = barrier.newLayout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  dest->image_layout = barrier.newLayout;
+  return true;
 }
 
 VkDescriptorSet TextureCache::PrepareTextureSet(
diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h
index aed25ad86..732dd493f 100644
--- a/src/xenia/gpu/vulkan/texture_cache.h
+++ b/src/xenia/gpu/vulkan/texture_cache.h
@@ -146,16 +146,23 @@ class TextureCache {
   TextureView* DemandView(Texture* texture, uint16_t swizzle);
   Sampler* Demand(const SamplerInfo& sampler_info);
 
+  void FlushPendingCommands(
+      VkCommandBuffer command_buffer,
+      std::shared_ptr<ui::vulkan::Fence> completion_fence);
+
+  void ConvertTexture2D(uint8_t* dest, const TextureInfo& src);
+  void ConvertTextureCube(uint8_t* dest, const TextureInfo& src);
+
   // Queues commands to upload a texture from system memory, applying any
   // conversions necessary. This may flush the command buffer to the GPU if we
   // run out of staging memory.
   bool UploadTexture2D(VkCommandBuffer command_buffer,
                        std::shared_ptr<ui::vulkan::Fence> completion_fence,
-                       Texture* dest, TextureInfo src);
+                       Texture* dest, const TextureInfo& src);
 
   bool UploadTextureCube(VkCommandBuffer command_buffer,
                          std::shared_ptr<ui::vulkan::Fence> completion_fence,
-                         Texture* dest, TextureInfo src);
+                         Texture* dest, const TextureInfo& src);
 
   bool SetupTextureBindings(
       VkCommandBuffer command_buffer,
@@ -172,6 +179,7 @@
   RegisterFile* register_file_ = nullptr;
   TraceWriter* trace_writer_ = nullptr;
   ui::vulkan::VulkanDevice* device_ = nullptr;
+  VkQueue device_queue_ = nullptr;
 
   VkDescriptorPool descriptor_pool_ = nullptr;
   VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr;
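
Aside (review note, not part of the patch): UploadTexture2D and UploadTextureCube now duplicate the same flush-and-retry sequence around the staging buffer. If a follow-up wanted to factor that out, it could look roughly like the hypothetical helper below, which returns null only when the texture is larger than the entire staging buffer (the case the XELOGE above reports):

```cpp
// Hypothetical TextureCache helper, sketched from the logic in this patch;
// not code from the tree. Returns a staging allocation, flushing any pending
// uploads once if the circular staging buffer is currently full.
auto TextureCache::AcquireStaging(
    VkCommandBuffer command_buffer,
    std::shared_ptr<ui::vulkan::Fence> completion_fence, size_t length) {
  if (!staging_buffer_.CanAcquire(length)) {
    // Submit queued uploads, wait on the fence, and scavenge staging memory.
    FlushPendingCommands(command_buffer, completion_fence);
  }
  // Null only if the request exceeds kStagingBufferSize outright.
  return staging_buffer_.CanAcquire(length)
             ? staging_buffer_.Acquire(length, completion_fence)
             : nullptr;
}
```

Either way, halving kStagingBufferSize to 32 MiB is a reasonable companion change once overflow flushes instead of asserting, since a single frame's uploads no longer have to fit in the buffer all at once.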