diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index bceba61a28..74c4c4da06 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -16,7 +16,9 @@ namespace vk u32 m_used_descriptors = 0; bool initialized = false; - u32 optimal_group_size = 64; + bool unroll_loops = true; + u32 optimal_group_size = 1; + u32 optimal_kernel_size = 1; void init_descriptors() { @@ -62,7 +64,15 @@ namespace vk case vk::driver_vendor::unknown: // Probably intel case vk::driver_vendor::NVIDIA: + unroll_loops = true; optimal_group_size = 32; + optimal_kernel_size = 16; + break; + case vk::driver_vendor::AMD: + case vk::driver_vendor::RADV: + unroll_loops = false; + optimal_kernel_size = 1; + optimal_group_size = 64; break; } @@ -155,9 +165,12 @@ namespace vk u32 m_data_length = 0; u32 kernel_size = 1; - void build(const char* function_name, u32 _kernel_size) + void build(const char* function_name, u32 _kernel_size = 0) { - kernel_size = _kernel_size; + // Initialize to allow detecting optimal settings + create(); + + kernel_size = _kernel_size? 
_kernel_size : optimal_kernel_size; m_src = { @@ -180,12 +193,23 @@ "void main()\n" "{\n" " uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n" - " for (uint loop = 0; loop < KERNEL_SIZE; ++loop)\n" - " {\n" - " uint value = data[index];\n" + " uint value;\n" + "\n" + }; + + std::string work_kernel = + { + " value = data[index];\n" " data[index] = %f(value);\n" + }; + + std::string loop_advance = + { " index++;\n" - " }\n" + }; + + const std::string suffix = + { "}\n" }; @@ -197,6 +221,40 @@ }; m_src = fmt::replace_all(m_src, syntax_replace); + work_kernel = fmt::replace_all(work_kernel, syntax_replace); + + if (kernel_size <= 1) + { + m_src += " {\n" + work_kernel + " }\n"; + } + else if (unroll_loops) + { + work_kernel += loop_advance + "\n"; + + m_src += std::string + ( + " //Unrolled loop\n" + " {\n" + ); + + // Assemble body with manual loop unroll to try lowering GPR usage + for (u32 n = 0; n < kernel_size; ++n) + { + m_src += work_kernel; + } + + m_src += " }\n"; + } + else + { + m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n"; + m_src += " {\n"; + m_src += work_kernel; + m_src += loop_advance; + m_src += " }\n"; + } + + m_src += suffix; } void bind_resources() override @@ -221,7 +279,7 @@ // byteswap ushort cs_shuffle_16() { - cs_shuffle_base::build("bswap_u16", 32); + cs_shuffle_base::build("bswap_u16"); } }; @@ -230,7 +288,7 @@ // byteswap_ulong cs_shuffle_32() { - cs_shuffle_base::build("bswap_u32", 32); + cs_shuffle_base::build("bswap_u32"); } }; @@ -239,7 +297,7 @@ // byteswap_ulong + byteswap_ushort cs_shuffle_32_16() { - cs_shuffle_base::build("bswap_u16_u32", 32); + cs_shuffle_base::build("bswap_u16_u32"); } }; @@ -248,7 +306,7 @@ // convert d24x8 to f32 cs_shuffle_d24x8_f32() { - cs_shuffle_base::build("d24x8_to_f32", 32); + cs_shuffle_base::build("d24x8_to_f32"); } }; @@ -257,7 +315,7 @@ // convert f32 to d24x8 and swap endianness
cs_shuffle_se_f32_d24x8() { - cs_shuffle_base::build("f32_to_d24x8_swapped", 32); + cs_shuffle_base::build("f32_to_d24x8_swapped"); } }; @@ -266,7 +324,7 @@ namespace vk // swap endianness of d24x8 cs_shuffle_se_d24x8() { - cs_shuffle_base::build("d24x8_to_d24x8_swapped", 32); + cs_shuffle_base::build("d24x8_to_d24x8_swapped"); } }; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index 40fba6c272..3d971c2fba 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -472,11 +472,13 @@ namespace vk //Currently we require: //1. Anisotropic sampling //2. DXT support + //3. Indexable storage buffers VkPhysicalDeviceFeatures available_features; vkGetPhysicalDeviceFeatures(*pgpu, &available_features); available_features.samplerAnisotropy = VK_TRUE; available_features.textureCompressionBC = VK_TRUE; + available_features.shaderStorageBufferArrayDynamicIndexing = VK_TRUE; VkDeviceCreateInfo device = {}; device.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index fa534e007e..bc5426dc9c 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -445,6 +445,7 @@ namespace vk size_t offset_in_buffer = upload_heap.alloc<512>(image_linear_size + 8); void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8); void *dst = mapped_buffer; + VkBuffer buffer_handle = upload_heap.heap->value; if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT) { @@ -466,10 +467,26 @@ namespace vk // NOTE: On commandbuffer submission, the HOST_WRITE to ALL_COMMANDS barrier is implicitly inserted according to spec // No need to add another explicit barrier unless a driver bug is found + // Executing GPU tasks on host_visible RAM is awful, copy to device-local buffer instead + auto scratch_buf = vk::get_scratch_buffer(); + + VkBufferCopy copy = {}; + copy.srcOffset = offset_in_buffer; + copy.dstOffset = 0; + copy.size = 
image_linear_size; + vkCmdCopyBuffer(cmd, buffer_handle, scratch_buf->value, 1, &copy); + insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); + vk::get_compute_task()->run(cmd, upload_heap.heap.get(), image_linear_size, offset_in_buffer); - insert_buffer_memory_barrier(cmd, upload_heap.heap->value, offset_in_buffer, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); + + buffer_handle = scratch_buf->value; + offset_in_buffer = 0; } VkBufferImageCopy copy_info = {}; @@ -483,7 +500,7 @@ copy_info.imageSubresource.mipLevel = mipmap_level % mipmap_count; copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes; - vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info); + vkCmdCopyBufferToImage(cmd, buffer_handle, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info); mipmap_level++; } } diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 74fad9a6e9..4fc746cf51 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -199,6 +199,8 @@ namespace vk { // TODO: Synchronize access to typeles textures target = vk::get_typeless_helper(vram_texture->info.format); + change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, subresource_range); + vk::copy_scaled_image(cmd, vram_texture->value, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, target->current_layout, 0, 0, vram_texture->width(), vram_texture->height(), 0, 0, transfer_width, transfer_height, 1, aspect_flag, true, VK_FILTER_NEAREST, 
vram_texture->info.format, target->info.format); @@ -212,15 +214,6 @@ change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresource_range); } - // TODO: Read back stencil values (is this really necessary?) - VkBufferImageCopy region = {}; - region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1}; - region.imageExtent = {transfer_width, transfer_height, 1}; - vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dma_buffer->value, 1, &region); - - change_image_layout(cmd, vram_texture, old_layout, subresource_range); - real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width; - // Handle any format conversions using compute tasks vk::cs_shuffle_base *shuffle_kernel = nullptr; @@ -247,13 +240,35 @@ } } + // Do not run the compute task on host visible memory + vk::buffer* mem_target = shuffle_kernel ? vk::get_scratch_buffer() : dma_buffer.get(); + + // TODO: Read back stencil values (is this really necessary?) 
+ VkBufferImageCopy region = {}; + region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1}; + region.imageExtent = {transfer_width, transfer_height, 1}; + vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mem_target->value, 1, &region); + + change_image_layout(cmd, vram_texture, old_layout, subresource_range); + real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width; + if (shuffle_kernel) { - vk::insert_buffer_memory_barrier(cmd, dma_buffer->value, 0, cpu_address_range, + verify (HERE), mem_target->value != dma_buffer->value; + + vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); - shuffle_kernel->run(cmd, dma_buffer.get(), cpu_address_range); + shuffle_kernel->run(cmd, mem_target, cpu_address_range); + + vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); + + VkBufferCopy copy = {}; + copy.size = cpu_address_range; + vkCmdCopyBuffer(cmd, mem_target->value, dma_buffer->value, 1, &copy); } if (manage_cb_lifetime)