nv2a/vk: Add compute pipeline cache, scale workgroups

Matt Borgerson 2024-07-26 17:21:02 -07:00 committed by mborgerson
parent 8e5a77c45d
commit bc46a9303d
2 changed files with 193 additions and 73 deletions

View File

@@ -257,16 +257,26 @@ typedef struct PGRAPHVkDisplayState {
GLuint gl_texture_id;
} PGRAPHVkDisplayState;
typedef struct ComputePipelineKey {
VkFormat host_fmt;
bool pack;
int workgroup_size;
} ComputePipelineKey;
typedef struct ComputePipeline {
LruNode node;
ComputePipelineKey key;
VkPipeline pipeline;
} ComputePipeline;
typedef struct PGRAPHVkComputeState {
VkDescriptorPool descriptor_pool;
VkDescriptorSetLayout descriptor_set_layout;
VkDescriptorSet descriptor_sets[1024];
int descriptor_set_index;
VkPipelineLayout pipeline_layout;
VkPipeline pipeline_pack_d24s8;
VkPipeline pipeline_unpack_d24s8;
VkPipeline pipeline_pack_f32s8;
VkPipeline pipeline_unpack_f32s8;
Lru pipeline_cache;
ComputePipeline *pipeline_cache_entries;
} PGRAPHVkComputeState;
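For illustration only (the format, pack direction, and workgroup size below are hypothetical example values, not taken from this change), a lookup key for the D24S8 pack path is built the way get_compute_pipeline below does it, zeroing the struct first so the byte-wise fast_hash sees deterministic padding:

    ComputePipelineKey key;
    memset(&key, 0, sizeof(key));               /* zero padding for stable hashing */
    key.host_fmt = VK_FORMAT_D24_UNORM_S8_UINT;
    key.pack = true;                            /* pack host depth/stencil to Z24S8 */
    key.workgroup_size = 256;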
typedef struct PGRAPHVkState {

View File

@@ -18,6 +18,8 @@
*/
#include "hw/xbox/nv2a/pgraph/pgraph.h"
#include "qemu/fast-hash.h"
#include "qemu/lru.h"
#include "renderer.h"
#include <vulkan/vulkan_core.h>
@@ -28,14 +30,12 @@
// swizzle shader we will need more flexibility.
const char *pack_d24_unorm_s8_uint_to_z24s8_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthIn { uint depth_in[]; };\n"
"layout(binding = 1) buffer StencilIn { uint stencil_in[]; };\n"
"layout(binding = 2) buffer DepthStencilOut { uint depth_stencil_out[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint scale = width_in / width_out;"
" uint scale = width_in / width_out;\n"
" uint y = (idx_out / width_out) * scale;\n"
" uint x = (idx_out % width_out) * scale;\n"
" return y * width_in + x;\n"
@@ -49,14 +49,12 @@ const char *pack_d24_unorm_s8_uint_to_z24s8_glsl =
"}\n";
const char *unpack_z24s8_to_d24_unorm_s8_uint_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthOut { uint depth_out[]; };\n"
"layout(binding = 1) buffer StencilOut { uint stencil_out[]; };\n"
"layout(binding = 2) buffer DepthStencilIn { uint depth_stencil_in[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint scale = width_out / width_in;"
" uint scale = width_out / width_in;\n"
" uint y = (idx_out / width_out) / scale;\n"
" uint x = (idx_out % width_out) / scale;\n"
" return y * width_in + x;\n"
@@ -76,14 +74,12 @@ const char *unpack_z24s8_to_d24_unorm_s8_uint_glsl =
"}\n";
const char *pack_d32_sfloat_s8_uint_to_z24s8_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthIn { float depth_in[]; };\n"
"layout(binding = 1) buffer StencilIn { uint stencil_in[]; };\n"
"layout(binding = 2) buffer DepthStencilOut { uint depth_stencil_out[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint scale = width_in / width_out;"
" uint scale = width_in / width_out;\n"
" uint y = (idx_out / width_out) * scale;\n"
" uint x = (idx_out % width_out) * scale;\n"
" return y * width_in + x;\n"
@@ -97,14 +93,12 @@ const char *pack_d32_sfloat_s8_uint_to_z24s8_glsl =
"}\n";
const char *unpack_z24s8_to_d32_sfloat_s8_uint_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthOut { float depth_out[]; };\n"
"layout(binding = 1) buffer StencilOut { uint stencil_out[]; };\n"
"layout(binding = 2) buffer DepthStencilIn { uint depth_stencil_in[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint scale = width_out / width_in;"
" uint scale = width_out / width_in;\n"
" uint y = (idx_out / width_out) / scale;\n"
" uint x = (idx_out % width_out) / scale;\n"
" return y * width_in + x;\n"
@@ -123,6 +117,35 @@ const char *unpack_z24s8_to_d32_sfloat_s8_uint_glsl =
" }\n"
"}\n";
static gchar *get_compute_shader_glsl(VkFormat host_fmt, bool pack,
int workgroup_size)
{
const char *template = NULL;
switch (host_fmt) {
case VK_FORMAT_D24_UNORM_S8_UINT:
template = pack ? pack_d24_unorm_s8_uint_to_z24s8_glsl :
unpack_z24s8_to_d24_unorm_s8_uint_glsl;
break;
case VK_FORMAT_D32_SFLOAT_S8_UINT:
template = pack ? pack_d32_sfloat_s8_uint_to_z24s8_glsl :
unpack_z24s8_to_d32_sfloat_s8_uint_glsl;
break;
default:
assert(!"Unsupported host fmt");
break;
}
assert(template);
gchar *glsl = g_strdup_printf(
"#version 450\n"
"layout(local_size_x = %d) in;\n"
"%s", workgroup_size, template);
assert(glsl);
return glsl;
}
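A minimal usage sketch (the 512 workgroup size is an arbitrary example, and the surrounding variables such as r are assumed to be in scope): the returned string is simply the size-dependent preamble followed by the selected template, and the caller owns and frees it.

    gchar *glsl = get_compute_shader_glsl(VK_FORMAT_D24_UNORM_S8_UINT,
                                          true /* pack */, 512);
    /* glsl now begins with:
     *   #version 450
     *   layout(local_size_x = 512) in;
     * followed by pack_d24_unorm_s8_uint_to_z24s8_glsl. */
    VkPipeline pipeline = create_compute_pipeline(r, glsl);
    g_free(glsl);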
static void create_descriptor_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
@@ -235,10 +258,14 @@ static void create_compute_pipeline_layout(PGRAPHState *pg)
&r->compute.pipeline_layout));
}
static VkPipeline create_compute_pipeline(PGRAPHState *pg, const char *glsl)
static void destroy_compute_pipeline_layout(PGRAPHVkState *r)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyPipelineLayout(r->device, r->compute.pipeline_layout, NULL);
r->compute.pipeline_layout = VK_NULL_HANDLE;
}
static VkPipeline create_compute_pipeline(PGRAPHVkState *r, const char *glsl)
{
ShaderModuleInfo *module = pgraph_vk_create_shader_module_from_glsl(
r, VK_SHADER_STAGE_COMPUTE_BIT, glsl);
@@ -304,6 +331,47 @@ void pgraph_vk_compute_finish_complete(PGRAPHVkState *r)
r->compute.descriptor_set_index = 0;
}
static int get_workgroup_size_for_output_units(PGRAPHVkState *r, int output_units)
{
int group_size = 1024;
// FIXME: Smarter workgroup size calculation could factor in multiple
// submissions. For now we will just pick the highest number that
// evenly divides output_units.
while (group_size > 1) {
if (group_size > r->device_props.limits.maxComputeWorkGroupSize[0]) {
group_size /= 2;
continue;
}
if (output_units % group_size == 0) {
break;
}
group_size /= 2;
}
return group_size;
}
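As a worked example with hypothetical dimensions: a 640x480 output has 307200 units, which 1024 divides evenly, so on a device whose maxComputeWorkGroupSize[0] is at least 1024 the loop exits immediately.

    int units = 640 * 480;                                   /* 307200 */
    int size = get_workgroup_size_for_output_units(r, units);
    /* size == 1024 here; the dispatch further below then issues
     * 307200 / 1024 == 300 workgroups. */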
static ComputePipeline *get_compute_pipeline(PGRAPHVkState *r, VkFormat host_fmt, bool pack, int output_units)
{
int workgroup_size = get_workgroup_size_for_output_units(r, output_units);
ComputePipelineKey key;
memset(&key, 0, sizeof(key));
key.host_fmt = host_fmt;
key.pack = pack;
key.workgroup_size = workgroup_size;
LruNode *node = lru_lookup(&r->compute.pipeline_cache,
fast_hash((void *)&key, sizeof(key)), &key);
ComputePipeline *pipeline = container_of(node, ComputePipeline, node);
assert(pipeline);
return pipeline;
}
//
// Pack depth+stencil into NV097_SET_SURFACE_FORMAT_ZETA_Z24S8
// formatted buffer with depth in bits 31-8 and stencil in bits 7-0.
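Expressed as a single illustrative C expression of that layout (variable names hypothetical, not part of this change):

    uint32_t z24s8 = (depth24 << 8) | (stencil8 & 0xFFu);   /* depth in bits 31-8, stencil in bits 7-0 */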
@@ -351,15 +419,20 @@ void pgraph_vk_pack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
update_descriptor_sets(pg, buffers, ARRAY_SIZE(buffers));
if (surface->host_fmt.vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_pack_d24s8);
} else if (surface->host_fmt.vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_pack_f32s8);
} else {
assert(!"Unsupported pack format");
}
size_t output_size_in_units = output_width * output_height;
ComputePipeline *pipeline = get_compute_pipeline(
r, surface->host_fmt.vk_format, true, output_size_in_units);
size_t workgroup_size_in_units = pipeline->key.workgroup_size;
assert(output_size_in_units % workgroup_size_in_units == 0);
size_t group_count = output_size_in_units / workgroup_size_in_units;
assert(r->device_props.limits.maxComputeWorkGroupSize[0] >= workgroup_size_in_units);
assert(r->device_props.limits.maxComputeWorkGroupCount[0] >= group_count);
// FIXME: Smarter workgroup scaling
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline);
vkCmdBindDescriptorSets(
cmd, VK_PIPELINE_BIND_POINT_COMPUTE, r->compute.pipeline_layout, 0, 1,
&r->compute.descriptor_sets[r->compute.descriptor_set_index - 1], 0,
@@ -371,11 +444,6 @@ void pgraph_vk_pack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
push_constants);
size_t workgroup_size_in_units = 256;
size_t output_size_in_units = output_width * output_height;
assert(output_size_in_units % workgroup_size_in_units == 0);
size_t group_count = output_size_in_units / workgroup_size_in_units;
// FIXME: Check max group count
vkCmdDispatch(cmd, group_count, 1, 1);
@@ -420,15 +488,20 @@ void pgraph_vk_unpack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
};
update_descriptor_sets(pg, buffers, ARRAY_SIZE(buffers));
if (surface->host_fmt.vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_unpack_d24s8);
} else if (surface->host_fmt.vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_unpack_f32s8);
} else {
assert(!"Unsupported pack format");
}
size_t output_size_in_units = output_width * output_height;
ComputePipeline *pipeline = get_compute_pipeline(
r, surface->host_fmt.vk_format, false, output_size_in_units);
size_t workgroup_size_in_units = pipeline->key.workgroup_size;
assert(output_size_in_units % workgroup_size_in_units == 0);
size_t group_count = output_size_in_units / workgroup_size_in_units;
assert(r->device_props.limits.maxComputeWorkGroupSize[0] >= workgroup_size_in_units);
assert(r->device_props.limits.maxComputeWorkGroupCount[0] >= group_count);
// FIXME: Smarter workgroup scaling
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline);
vkCmdBindDescriptorSets(
cmd, VK_PIPELINE_BIND_POINT_COMPUTE, r->compute.pipeline_layout, 0, 1,
&r->compute.descriptor_sets[r->compute.descriptor_set_index - 1], 0,
@@ -440,17 +513,75 @@ void pgraph_vk_unpack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
vkCmdPushConstants(cmd, r->compute.pipeline_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
push_constants);
size_t workgroup_size_in_units = 256;
size_t output_size_in_units = output_width * output_height;
assert(output_size_in_units % workgroup_size_in_units == 0);
size_t group_count = output_size_in_units / workgroup_size_in_units;
// FIXME: Check max group count
vkCmdDispatch(cmd, group_count, 1, 1);
}
static void pipeline_cache_entry_init(Lru *lru, LruNode *node, void *state)
{
PGRAPHVkState *r = container_of(lru, PGRAPHVkState, compute.pipeline_cache);
ComputePipeline *snode = container_of(node, ComputePipeline, node);
memcpy(&snode->key, state, sizeof(snode->key));
if (snode->key.workgroup_size == 1) {
fprintf(stderr,
"Warning: Needed compute shader with workgroup size = 1\n");
}
gchar *glsl = get_compute_shader_glsl(
snode->key.host_fmt, snode->key.pack, snode->key.workgroup_size);
assert(glsl);
snode->pipeline = create_compute_pipeline(r, glsl);
g_free(glsl);
}
static void pipeline_cache_release_node_resources(PGRAPHVkState *r, ComputePipeline *snode)
{
vkDestroyPipeline(r->device, snode->pipeline, NULL);
snode->pipeline = VK_NULL_HANDLE;
}
static bool pipeline_cache_entry_pre_evict(Lru *lru, LruNode *node)
{
// FIXME: Check pipeline not in use
return false;
}
static void pipeline_cache_entry_post_evict(Lru *lru, LruNode *node)
{
PGRAPHVkState *r = container_of(lru, PGRAPHVkState, compute.pipeline_cache);
ComputePipeline *snode = container_of(node, ComputePipeline, node);
pipeline_cache_release_node_resources(r, snode);
}
static bool pipeline_cache_entry_compare(Lru *lru, LruNode *node, void *key)
{
ComputePipeline *snode = container_of(node, ComputePipeline, node);
return memcmp(&snode->key, key, sizeof(ComputePipelineKey));
}
static void pipeline_cache_init(PGRAPHVkState *r)
{
const size_t pipeline_cache_size = 100; // FIXME: Trim
lru_init(&r->compute.pipeline_cache);
r->compute.pipeline_cache_entries = g_malloc_n(pipeline_cache_size, sizeof(ComputePipeline));
assert(r->compute.pipeline_cache_entries != NULL);
for (int i = 0; i < pipeline_cache_size; i++) {
lru_add_free(&r->compute.pipeline_cache, &r->compute.pipeline_cache_entries[i].node);
}
r->compute.pipeline_cache.init_node = pipeline_cache_entry_init;
r->compute.pipeline_cache.compare_nodes = pipeline_cache_entry_compare;
r->compute.pipeline_cache.pre_node_evict = pipeline_cache_entry_pre_evict;
r->compute.pipeline_cache.post_node_evict = pipeline_cache_entry_post_evict;
}
static void pipeline_cache_finalize(PGRAPHVkState *r)
{
lru_flush(&r->compute.pipeline_cache);
g_free(r->compute.pipeline_cache_entries);
r->compute.pipeline_cache_entries = NULL;
}
void pgraph_vk_init_compute(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
@@ -459,36 +590,15 @@ void pgraph_vk_init_compute(PGRAPHState *pg)
create_descriptor_set_layout(pg);
create_descriptor_sets(pg);
create_compute_pipeline_layout(pg);
r->compute.pipeline_pack_d24s8 =
create_compute_pipeline(pg, pack_d24_unorm_s8_uint_to_z24s8_glsl);
r->compute.pipeline_unpack_d24s8 =
create_compute_pipeline(pg, unpack_z24s8_to_d24_unorm_s8_uint_glsl);
r->compute.pipeline_pack_f32s8 =
create_compute_pipeline(pg, pack_d32_sfloat_s8_uint_to_z24s8_glsl);
r->compute.pipeline_unpack_f32s8 =
create_compute_pipeline(pg, unpack_z24s8_to_d32_sfloat_s8_uint_glsl);
pipeline_cache_init(r);
}
void pgraph_vk_finalize_compute(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkPipeline *pipelines[] = {
&r->compute.pipeline_pack_d24s8,
&r->compute.pipeline_unpack_d24s8,
&r->compute.pipeline_pack_f32s8,
&r->compute.pipeline_unpack_f32s8,
};
for (int i = 0; i < ARRAY_SIZE(pipelines); i++) {
vkDestroyPipeline(r->device, *pipelines[i], NULL);
pipelines[i] = VK_NULL_HANDLE;
}
vkDestroyPipelineLayout(r->device, r->compute.pipeline_layout, NULL);
r->compute.pipeline_layout = VK_NULL_HANDLE;
pipeline_cache_finalize(r);
destroy_compute_pipeline_layout(r);
destroy_descriptor_sets(pg);
destroy_descriptor_set_layout(pg);
destroy_descriptor_pool(pg);