vk: Tuning [WIP]

- Unroll main compute queue loop
- Do NOT run GPU compute jobs on mappable memory! This has a dreadful impact on performance: host-visible allocations are typically uncached for the GPU or reached across the PCIe bus
- Enable dynamic SSBO indexing (affects AMD); a sketch of the construct this gates follows this list
- Make loop unrolling and the loop length variable depending on hardware, and find the optimum per vendor
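For reference, the sketch below shows the kind of GLSL construct gated behind the shaderStorageBufferArrayDynamicIndexing feature enabled by this commit. It is a hypothetical shader, written as a C++ string literal in the style of the compute kernels in this patch; the commit itself adds nothing like it:

const char* dynamic_ssbo_indexing_sketch =
	"#version 430\n"
	"layout(local_size_x=64) in;\n"
	"// An array of SSBOs indexed by a run-time (dynamically uniform) value;\n"
	"// some drivers (notably AMD) gate this on the device feature\n"
	"layout(std430, set=0, binding=0) buffer ssbo { uint data[]; } buffers[4];\n"
	"layout(push_constant) uniform pc { uint buffer_id; };\n"
	"\n"
	"void main()\n"
	"{\n"
	"	buffers[buffer_id].data[gl_GlobalInvocationID.x] += 1u;\n"
	"}\n";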
kd-11 2018-06-23 15:15:55 +03:00 committed by kd-11
parent d484253136
commit bda65f93a6
4 changed files with 118 additions and 26 deletions

View File

@@ -16,7 +16,9 @@ namespace vk
u32 m_used_descriptors = 0;
bool initialized = false;
u32 optimal_group_size = 64;
bool unroll_loops = true;
u32 optimal_group_size = 1;
u32 optimal_kernel_size = 1;
void init_descriptors()
{
@@ -62,7 +64,15 @@ namespace vk
case vk::driver_vendor::unknown:
// Probably Intel
case vk::driver_vendor::NVIDIA:
unroll_loops = true;
optimal_group_size = 32;
optimal_kernel_size = 16;
break;
case vk::driver_vendor::AMD:
case vk::driver_vendor::RADV:
unroll_loops = false;
optimal_kernel_size = 1;
optimal_group_size = 64;
break;
}
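These constants follow the hardware SIMD width: NVIDIA executes 32-wide warps, while GCN-era AMD uses 64-wide wavefronts. Assuming Vulkan 1.1 were available, an alternative sketch would query the subgroup size directly instead of switching on the driver vendor (illustrative only; the commit keeps the vendor switch):

// Hypothetical: derive the dispatch group size from the reported subgroup size
VkPhysicalDeviceSubgroupProperties subgroup_props = {};
subgroup_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;

VkPhysicalDeviceProperties2 props2 = {};
props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
props2.pNext = &subgroup_props;

// 'physical_device' stands in for the active VkPhysicalDevice handle
vkGetPhysicalDeviceProperties2(physical_device, &props2);

optimal_group_size = subgroup_props.subgroupSize; // 32 on NVIDIA, 64 on GCN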
@@ -155,9 +165,12 @@ namespace vk
u32 m_data_length = 0;
u32 kernel_size = 1;
void build(const char* function_name, u32 _kernel_size)
void build(const char* function_name, u32 _kernel_size = 0)
{
kernel_size = _kernel_size;
// Initialize to allow detecting optimal settings
create();
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
m_src =
{
@@ -180,12 +193,23 @@ namespace vk
"void main()\n"
"{\n"
" uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
" for (uint loop = 0; loop < KERNEL_SIZE; ++loop)\n"
" {\n"
" uint value = data[index];\n"
" uint value;\n"
"\n"
};
std::string work_kernel =
{
" value = data[index];\n"
" data[index] = %f(value);\n"
};
std::string loop_advance =
{
" index++;\n"
" }\n"
};
const std::string suffix =
{
"}\n"
};
@@ -197,6 +221,40 @@ namespace vk
};
m_src = fmt::replace_all(m_src, syntax_replace);
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
if (kernel_size <= 1)
{
m_src += " {\n" + work_kernel + " }\n";
}
else if (unroll_loops)
{
work_kernel += loop_advance + "\n";
m_src += std::string
(
" //Unrolled loop\n"
" {\n"
);
// Assemble body with manual loop unroll to try lowering GPR usage
for (u32 n = 0; n < kernel_size; ++n)
{
m_src += work_kernel;
}
m_src += " }\n";
}
else
{
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
m_src += " {\n";
m_src += work_kernel;
m_src += loop_advance;
m_src += " }\n";
}
m_src += suffix;
}
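Putting the fragments together: a build("bswap_u32") call with kernel_size == 2 on a driver where unroll_loops is true would emit roughly the following main(), after the %ws/%ks/%f substitutions (a reconstruction for illustration, assuming the emitted preamble defines KERNEL_SIZE and bswap_u32; not verbatim compiler input):

const char* expected_unrolled_main =
	"void main()\n"
	"{\n"
	"	uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
	"	uint value;\n"
	"\n"
	"	//Unrolled loop\n"
	"	{\n"
	"		value = data[index];\n"
	"		data[index] = bswap_u32(value);\n"
	"		index++;\n"
	"\n"
	"		value = data[index];\n"
	"		data[index] = bswap_u32(value);\n"
	"		index++;\n"
	"\n"
	"	}\n"
	"}\n";

With unroll_loops false the same work_kernel is instead wrapped in the for-loop and unrolling is left to the driver; the manual unroll exists to try lowering GPR usage, as the comment above notes.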
void bind_resources() override
@@ -221,7 +279,7 @@ namespace vk
// byteswap ushort
cs_shuffle_16()
{
cs_shuffle_base::build("bswap_u16", 32);
cs_shuffle_base::build("bswap_u16");
}
};
@@ -230,7 +288,7 @@ namespace vk
// byteswap_ulong
cs_shuffle_32()
{
cs_shuffle_base::build("bswap_u32", 32);
cs_shuffle_base::build("bswap_u32");
}
};
@@ -239,7 +297,7 @@ namespace vk
// byteswap_ulong + byteswap_ushort
cs_shuffle_32_16()
{
cs_shuffle_base::build("bswap_u16_u32", 32);
cs_shuffle_base::build("bswap_u16_u32");
}
};
@@ -248,7 +306,7 @@ namespace vk
// convert d24x8 to f32
cs_shuffle_d24x8_f32()
{
cs_shuffle_base::build("d24x8_to_f32", 32);
cs_shuffle_base::build("d24x8_to_f32");
}
};
@@ -257,7 +315,7 @@ namespace vk
// convert f32 to d24x8 and swap endianness
cs_shuffle_se_f32_d24x8()
{
cs_shuffle_base::build("f32_to_d24x8_swapped", 32);
cs_shuffle_base::build("f32_to_d24x8_swapped");
}
};
@@ -266,7 +324,7 @@ namespace vk
// swap endianness of d24x8
cs_shuffle_se_d24x8()
{
cs_shuffle_base::build("d24x8_to_d24x8_swapped", 32);
cs_shuffle_base::build("d24x8_to_d24x8_swapped");
}
};

View File

@@ -472,11 +472,13 @@ namespace vk
//Currently we require:
//1. Anisotropic sampling
//2. DXT support
//3. Indexable storage buffers
VkPhysicalDeviceFeatures available_features;
vkGetPhysicalDeviceFeatures(*pgpu, &available_features);
available_features.samplerAnisotropy = VK_TRUE;
available_features.textureCompressionBC = VK_TRUE;
available_features.shaderStorageBufferArrayDynamicIndexing = VK_TRUE;
VkDeviceCreateInfo device = {};
device.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
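One caveat: the three features are forced to VK_TRUE without checking the values returned by vkGetPhysicalDeviceFeatures, and enabling a feature the device does not support makes vkCreateDevice invalid per the Vulkan spec. A more defensive sketch (illustrative only, not the committed code):

VkPhysicalDeviceFeatures supported = {};
vkGetPhysicalDeviceFeatures(*pgpu, &supported);

if (!supported.samplerAnisotropy ||
	!supported.textureCompressionBC ||
	!supported.shaderStorageBufferArrayDynamicIndexing)
{
	// A required feature is missing; abort device creation or take a fallback path
}

// Enable only what is actually required rather than the full supported set
VkPhysicalDeviceFeatures enabled = {};
enabled.samplerAnisotropy = VK_TRUE;
enabled.textureCompressionBC = VK_TRUE;
enabled.shaderStorageBufferArrayDynamicIndexing = VK_TRUE;
// ...then point VkDeviceCreateInfo::pEnabledFeatures at 'enabled'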

View File

@@ -445,6 +445,7 @@ namespace vk
size_t offset_in_buffer = upload_heap.alloc<512>(image_linear_size + 8);
void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8);
void *dst = mapped_buffer;
VkBuffer buffer_handle = upload_heap.heap->value;
if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT)
{
@@ -466,10 +467,26 @@ namespace vk
// NOTE: On command buffer submission, the HOST_WRITE to ALL_COMMANDS barrier is implicitly inserted according to the spec
// No need to add another explicit barrier unless a driver bug is found
// Executing GPU tasks on host_visible RAM is awful, copy to device-local buffer instead
auto scratch_buf = vk::get_scratch_buffer();
VkBufferCopy copy = {};
copy.srcOffset = offset_in_buffer;
copy.dstOffset = 0;
copy.size = image_linear_size;
vkCmdCopyBuffer(cmd, buffer_handle, scratch_buf->value, 1, &copy);
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
vk::get_compute_task<vk::cs_shuffle_d24x8_f32>()->run(cmd, upload_heap.heap.get(), image_linear_size, offset_in_buffer);
insert_buffer_memory_barrier(cmd, upload_heap.heap->value, offset_in_buffer, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
vk::get_compute_task<vk::cs_shuffle_d24x8_f32>()->run(cmd, scratch_buf, image_linear_size, 0);
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
buffer_handle = scratch_buf->value;
offset_in_buffer = 0;
}
VkBufferImageCopy copy_info = {};
@@ -483,7 +500,7 @@ namespace vk
copy_info.imageSubresource.mipLevel = mipmap_level % mipmap_count;
copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;
vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
vkCmdCopyBufferToImage(cmd, buffer_handle, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
mipmap_level++;
}
}
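The copy/barrier/dispatch/barrier choreography above reappears in the texture-cache change below. As a sketch, it could be factored into a helper along these lines (hypothetical function assembled only from the signatures visible in this patch, not something the commit adds):

// Stage host-visible data into a device-local scratch buffer, transform it with
// a compute kernel, then make the result visible to the transfer that consumes it.
template <typename CommandBuffer>
void transform_on_scratch(CommandBuffer& cmd, vk::cs_shuffle_base* kernel,
	VkBuffer src, u32 src_offset, vk::buffer* scratch, u32 length)
{
	// 1. Copy the raw data into the scratch buffer
	VkBufferCopy copy = { src_offset, 0, length };
	vkCmdCopyBuffer(cmd, src, scratch->value, 1, &copy);

	// 2. Transfer write -> compute read
	insert_buffer_memory_barrier(cmd, scratch->value, 0, length,
		VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
		VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);

	// 3. Run the conversion kernel in place on device-local memory
	kernel->run(cmd, scratch, length);

	// 4. Compute write -> transfer read
	insert_buffer_memory_barrier(cmd, scratch->value, 0, length,
		VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
		VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
}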

View File

@@ -199,6 +199,8 @@ namespace vk
{
// TODO: Synchronize access to typeless textures
target = vk::get_typeless_helper(vram_texture->info.format);
change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, subresource_range);
vk::copy_scaled_image(cmd, vram_texture->value, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, target->current_layout,
0, 0, vram_texture->width(), vram_texture->height(), 0, 0, transfer_width, transfer_height, 1, aspect_flag, true, VK_FILTER_NEAREST,
vram_texture->info.format, target->info.format);
@@ -212,15 +214,6 @@ namespace vk
change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresource_range);
}
// TODO: Read back stencil values (is this really necessary?)
VkBufferImageCopy region = {};
region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
region.imageExtent = {transfer_width, transfer_height, 1};
vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dma_buffer->value, 1, &region);
change_image_layout(cmd, vram_texture, old_layout, subresource_range);
real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width;
// Handle any format conversions using compute tasks
vk::cs_shuffle_base *shuffle_kernel = nullptr;
@@ -247,13 +240,35 @@ namespace vk
}
}
// Do not run the compute task on host visible memory
vk::buffer* mem_target = shuffle_kernel ? vk::get_scratch_buffer() : dma_buffer.get();
// TODO: Read back stencil values (is this really necessary?)
VkBufferImageCopy region = {};
region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
region.imageExtent = {transfer_width, transfer_height, 1};
vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mem_target->value, 1, &region);
change_image_layout(cmd, vram_texture, old_layout, subresource_range);
real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width;
if (shuffle_kernel)
{
vk::insert_buffer_memory_barrier(cmd, dma_buffer->value, 0, cpu_address_range,
verify (HERE), mem_target->value != dma_buffer->value;
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, dma_buffer.get(), cpu_address_range);
shuffle_kernel->run(cmd, mem_target, cpu_address_range);
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
VkBufferCopy copy = {};
copy.size = cpu_address_range;
vkCmdCopyBuffer(cmd, mem_target->value, dma_buffer->value, 1, &copy);
}
if (manage_cb_lifetime)
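For completeness: dma_buffer stays host-visible because the CPU reads the converted data back, so mapping it is only valid after this command buffer's fence has signalled. A generic sketch of that ordering (dev, submit_fence and dma_buffer_memory are assumed names, not this file's actual flush path):

// Wait for the submission that recorded the copy above, then map and read
vkWaitForFences(dev, 1, &submit_fence, VK_TRUE, UINT64_MAX);

void* mapped = nullptr;
vkMapMemory(dev, dma_buffer_memory, 0, cpu_address_range, 0, &mapped);
// ... consume the byte-swapped / converted data on the CPU ...
vkUnmapMemory(dev, dma_buffer_memory);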