From 99d71fdc2a362e3af0e53b791ce88a8556353aad Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 5 Nov 2019 17:00:07 +0300 Subject: [PATCH] vk: Implement layer batching for the GPU swizzle decoder - Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes. - Handles all LODs of a 3D texture in one pass as well. - The improvements do warrant dropping down the number of allowed compute invocations a bit --- rpcs3/Emu/RSX/VK/VKCompute.h | 91 ++++++++++++++++++++++++++-------- rpcs3/Emu/RSX/VK/VKTexture.cpp | 76 +++++++++++++++++++++++++--- 2 files changed, 138 insertions(+), 29 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index 786448c619..da06182858 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -2,7 +2,7 @@ #include "VKHelpers.h" #include "Utilities/StrUtil.h" -#define VK_MAX_COMPUTE_TASKS 32768 // Max number of jobs per frame +#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame namespace vk { @@ -539,7 +539,7 @@ namespace vk // Reverse morton-order block arrangement struct cs_deswizzle_base : compute_task { - virtual void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) = 0; + virtual void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) = 0; }; template @@ -547,7 +547,7 @@ namespace vk { union params_t { - u32 data[6]; + u32 data[7]; struct { @@ -557,6 +557,7 @@ namespace vk u32 logw; u32 logh; u32 logd; + u32 mipmaps; }; } params; @@ -573,13 +574,13 @@ namespace vk ssbo_count = 2; use_push_constants = true; - push_constants_size = 24; + push_constants_size = 28; create(); m_src = "#version 450\n" - "layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n\n" + "layout(local_size_x = %ws, local_size_y = 1, 
local_size_z = 1) in;\n\n" "layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n" "layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n" @@ -591,10 +592,46 @@ namespace vk " uint image_logw;\n" " uint image_logh;\n" " uint image_logd;\n" + " uint lod_count;\n" + "};\n\n" + + "struct invocation_properties\n" + "{\n" + " uint data_offset;\n" + " uvec3 size;\n" + " uvec3 size_log2;\n" "};\n\n" "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" - "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" + "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n\n" + + "invocation_properties invocation;\n\n" + + "bool init_invocation_properties(const in uint offset)\n" + "{\n" + " invocation.data_offset = 0;\n" + " invocation.size.x = image_width;\n" + " invocation.size.y = image_height;\n" + " invocation.size.z = image_depth;\n" + " invocation.size_log2.x = image_logw;\n" + " invocation.size_log2.y = image_logh;\n" + " invocation.size_log2.z = image_logd;\n" + " uint level_end = image_width * image_height * image_depth;\n" + " uint level = 1;\n\n" + + " while (offset >= level_end && level < lod_count)\n" + " {\n" + " invocation.data_offset = level_end;\n" + " invocation.size.xy /= 2;\n" + " invocation.size.xy = max(invocation.size.xy, uvec2(1));\n" + " invocation.size_log2.xy = max(invocation.size_log2.xy, uvec2(1));\n" + " invocation.size_log2.xy --;\n" + " level_end += (invocation.size.x * invocation.size.y * image_depth);\n" + " level++;" + " }\n\n" + + " return (offset < level_end);\n" + "}\n\n" "uint get_z_index(const in uint x_, const in uint y_, const in uint z_)\n" "{\n" @@ -603,9 +640,9 @@ namespace vk " uint x = x_;\n" " uint y = y_;\n" " uint z = z_;\n" - " uint log2w = image_logw;\n" - " uint log2h = image_logh;\n" - " uint 
log2d = image_logd;\n" + " uint log2w = invocation.size_log2.x;\n" + " uint log2h = invocation.size_log2.y;\n" + " uint log2d = invocation.size_log2.z;\n" "\n" " do\n" " {\n" @@ -640,15 +677,25 @@ namespace vk "void main()\n" "{\n" - " if (any(greaterThanEqual(gl_GlobalInvocationID, uvec3(image_width, image_height, image_depth))))\n" + " uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);" + " uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n" + " uint word_count = %_wordcount;\n\n" + + " if (!init_invocation_properties(texel_id))\n" " return;\n\n" - " uint texel_id = (gl_GlobalInvocationID.z * image_width * image_height) + (gl_GlobalInvocationID.y * image_width) + gl_GlobalInvocationID.x;\n" - " uint word_count = %_wordcount;\n" - " uint dst_id = (texel_id * word_count);\n\n" + " // Calculations done in texels, not bytes\n" + " uint row_length = invocation.size.x;\n" + " uint slice_length = (invocation.size.y * row_length);\n" + " uint level_offset = (texel_id - invocation.data_offset);\n" + " uint slice_offset = (level_offset % slice_length);\n" + " uint z = (level_offset / slice_length);\n" + " uint y = (slice_offset / row_length);\n" + " uint x = (slice_offset % row_length);\n\n" - " uint src_id = get_z_index(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z);\n" - " src_id *= word_count;\n\n" + " uint src_texel_id = get_z_index(x, y, z);\n" + " uint dst_id = (texel_id * word_count);\n" + " uint src_id = (src_texel_id + invocation.data_offset) * word_count;\n\n" " for (uint i = 0; i < word_count; ++i)\n" " {\n" @@ -677,6 +724,7 @@ namespace vk const std::pair syntax_replace[] = { + { "%ws", std::to_string(optimal_group_size) }, { "%_wordcount", std::to_string(sizeof(_BlockType) / 4) }, { "%f", transform } }; @@ -692,29 +740,30 @@ namespace vk void set_parameters(VkCommandBuffer cmd) { - vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, 24, params.data); + 
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constants_size, params.data); } - void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) override + void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) override { dst_buffer = dst; src_buffer = src; this->in_offset = in_offset; this->out_offset = out_offset; - this->block_length = sizeof(_BlockType) * width * height * depth; + this->block_length = data_length; params.width = width; params.height = height; params.depth = depth; + params.mipmaps = mipmaps; params.logw = rsx::ceil_log2(width); params.logh = rsx::ceil_log2(height); params.logd = rsx::ceil_log2(depth); set_parameters(cmd); - const u32 invocations_x = align(params.width, 8) / 8; - const u32 invocations_y = align(params.height, 8) / 8; - compute_task::run(cmd, invocations_x, invocations_y, depth); + const u32 num_bytes_per_invocation = (4 * optimal_group_size); + const u32 linear_invocations = rsx::aligned_div(data_length, num_bytes_per_invocation); + compute_task::run(cmd, linear_invocations); } }; diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 4fe7944db3..f7b661e0b7 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -609,14 +609,59 @@ namespace vk verify(HERE), job; - for (auto &section : sections) - { - job->run(cmd, scratch_buf, dst_offset, scratch_buf, section.bufferOffset, - section.imageExtent.width, section.imageExtent.height, section.imageExtent.depth); + auto next_layer = sections.front().imageSubresource.baseArrayLayer; + auto next_level = sections.front().imageSubresource.mipLevel; + unsigned base = 0; + unsigned lods = 0; - const u32 packed_size = section.imageExtent.width * section.imageExtent.height * section.imageExtent.depth * block_size; - 
section.bufferOffset = dst_offset; - dst_offset += packed_size; + std::vector<std::pair<unsigned, unsigned>> packets; + for (unsigned i = 0; i < sections.size(); ++i) + { + verify(HERE), sections[i].bufferRowLength; + + const auto layer = sections[i].imageSubresource.baseArrayLayer; + const auto level = sections[i].imageSubresource.mipLevel; + + if (layer == next_layer && + level == next_level) + { + next_level++; + lods++; + continue; + } + + packets.push_back({base, lods }); + next_layer = layer; + next_level = 1; + base = i; + lods = 1; + } + + if (packets.empty() || + (packets.back().first + packets.back().second) < sections.size()) + { + packets.push_back({base, lods}); + } + + for (const auto &packet : packets) + { + const auto& section = sections[packet.first]; + const auto src_offset = section.bufferOffset; + + // Align output to 128-byte boundary to keep some drivers happy + dst_offset = align(dst_offset, 128); + + u32 data_length = 0; + for (unsigned i = 0, j = packet.first; i < packet.second; ++i, ++j) + { + const u32 packed_size = sections[j].imageExtent.width * sections[j].imageExtent.height * sections[j].imageExtent.depth * block_size; + sections[j].bufferOffset = dst_offset; + dst_offset += packed_size; + data_length += packed_size; + } + + job->run(cmd, scratch_buf, section.bufferOffset, scratch_buf, src_offset, data_length, + section.imageExtent.width, section.imageExtent.height, section.imageExtent.depth, packet.second); } verify(HERE), dst_offset <= scratch_buf->size(); @@ -645,7 +690,16 @@ namespace vk { if (LIKELY(!heap_align)) { - row_pitch = (layout.pitch_in_block * block_size_in_bytes); + if (LIKELY(!layout.border)) + { + row_pitch = (layout.pitch_in_block * block_size_in_bytes); + } + else + { + // Skip the border texels if possible. 
Padding is undesirable for GPU deswizzle + row_pitch = (layout.width_in_block * block_size_in_bytes); + } + caps.alignment = row_pitch; } else @@ -692,6 +746,12 @@ namespace vk buffer_copies.reserve(subresource_layout.size()); } + if (layout.level == 0) + { + // Align mip0 on a 128-byte boundary + scratch_offset = align(scratch_offset, 128); + } + // Copy from upload heap to scratch mem buffer_copies.push_back({}); auto& copy = buffer_copies.back();