/*
 * Geforce NV2A PGRAPH Vulkan Renderer
 *
 * Copyright (c) 2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "hw/xbox/nv2a/pgraph/pgraph.h"
#include "qemu/fast-hash.h"
#include "qemu/lru.h"
#include "renderer.h"
#include <vulkan/vulkan_core.h>
// TODO: Swizzle/Unswizzle
// TODO: Float depth format (low priority, but would be better for accuracy)

// FIXME: The pipeline creation below assumes an identical three-buffer
// setup. The swizzle shader will need more flexibility.
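
/*
 * The GLSL snippets below convert between the host depth/stencil
 * representation (separate depth and stencil planes) and the guest
 * NV097_SET_SURFACE_FORMAT_ZETA_Z24S8 layout, which stores depth in
 * bits 31-8 and stencil in bits 7-0 of each 32-bit word. For example,
 * depth 0x123456 with stencil 0x78 packs to 0x12345678.
 */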
const char *pack_d24_unorm_s8_uint_to_z24s8_glsl =
    "layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
    "layout(set = 0, binding = 0) buffer DepthIn { uint depth_in[]; };\n"
    "layout(set = 0, binding = 1) buffer StencilIn { uint stencil_in[]; };\n"
    "layout(set = 0, binding = 2) buffer DepthStencilOut { uint depth_stencil_out[]; };\n"
    "uint get_input_idx(uint idx_out) {\n"
    "    uint scale = width_in / width_out;\n"
    "    uint y = (idx_out / width_out) * scale;\n"
    "    uint x = (idx_out % width_out) * scale;\n"
    "    return y * width_in + x;\n"
    "}\n"
    "void main() {\n"
    "    uint idx_out = gl_GlobalInvocationID.x;\n"
    "    uint idx_in = get_input_idx(idx_out);\n"
    "    uint depth_value = depth_in[idx_in];\n"
    "    uint stencil_value = (stencil_in[idx_in / 4] >> ((idx_in % 4) * 8)) & 0xff;\n"
    "    depth_stencil_out[idx_out] = depth_value << 8 | stencil_value;\n"
    "}\n";
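
/*
 * Unpacking writes 8-bit stencil values through 32-bit SSBO words, so
 * only every fourth invocation stores stencil, gathering the bytes for
 * its own pixel and the next three into a single word.
 */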
const char *unpack_z24s8_to_d24_unorm_s8_uint_glsl =
    "layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
    "layout(set = 0, binding = 0) buffer DepthOut { uint depth_out[]; };\n"
    "layout(set = 0, binding = 1) buffer StencilOut { uint stencil_out[]; };\n"
    "layout(set = 0, binding = 2) buffer DepthStencilIn { uint depth_stencil_in[]; };\n"
    "uint get_input_idx(uint idx_out) {\n"
    "    uint scale = width_out / width_in;\n"
    "    uint y = (idx_out / width_out) / scale;\n"
    "    uint x = (idx_out % width_out) / scale;\n"
    "    return y * width_in + x;\n"
    "}\n"
    "void main() {\n"
    "    uint idx_out = gl_GlobalInvocationID.x;\n"
    "    uint idx_in = get_input_idx(idx_out);\n"
    "    depth_out[idx_out] = depth_stencil_in[idx_in] >> 8;\n"
    "    if (idx_out % 4 == 0) {\n"
    "        uint stencil_value = 0;\n"
    "        for (int i = 0; i < 4; i++) {\n" // Include next 3 pixels
    "            uint v = depth_stencil_in[get_input_idx(idx_out + i)] & 0xff;\n"
    "            stencil_value |= v << (i * 8);\n"
    "        }\n"
    "        stencil_out[idx_out / 4] = stencil_value;\n"
    "    }\n"
    "}\n";
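
/*
 * D32_SFLOAT_S8_UINT variants: host depth is a float in [0, 1], scaled
 * to the 24-bit integer range (0xffffff) when packing and divided back
 * out when unpacking.
 */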
const char *pack_d32_sfloat_s8_uint_to_z24s8_glsl =
    "layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
    "layout(set = 0, binding = 0) buffer DepthIn { float depth_in[]; };\n"
    "layout(set = 0, binding = 1) buffer StencilIn { uint stencil_in[]; };\n"
    "layout(set = 0, binding = 2) buffer DepthStencilOut { uint depth_stencil_out[]; };\n"
    "uint get_input_idx(uint idx_out) {\n"
    "    uint scale = width_in / width_out;\n"
    "    uint y = (idx_out / width_out) * scale;\n"
    "    uint x = (idx_out % width_out) * scale;\n"
    "    return y * width_in + x;\n"
    "}\n"
    "void main() {\n"
    "    uint idx_out = gl_GlobalInvocationID.x;\n"
    "    uint idx_in = get_input_idx(idx_out);\n"
    "    uint depth_value = int(depth_in[idx_in] * float(0xffffff));\n"
    "    uint stencil_value = (stencil_in[idx_in / 4] >> ((idx_in % 4) * 8)) & 0xff;\n"
    "    depth_stencil_out[idx_out] = depth_value << 8 | stencil_value;\n"
    "}\n";

const char *unpack_z24s8_to_d32_sfloat_s8_uint_glsl =
    "layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
    "layout(set = 0, binding = 0) buffer DepthOut { float depth_out[]; };\n"
    "layout(set = 0, binding = 1) buffer StencilOut { uint stencil_out[]; };\n"
    "layout(set = 0, binding = 2) buffer DepthStencilIn { uint depth_stencil_in[]; };\n"
    "uint get_input_idx(uint idx_out) {\n"
    "    uint scale = width_out / width_in;\n"
    "    uint y = (idx_out / width_out) / scale;\n"
    "    uint x = (idx_out % width_out) / scale;\n"
    "    return y * width_in + x;\n"
    "}\n"
    "void main() {\n"
    "    uint idx_out = gl_GlobalInvocationID.x;\n"
    "    uint idx_in = get_input_idx(idx_out);\n"
    "    depth_out[idx_out] = float(depth_stencil_in[idx_in] >> 8) / float(0xffffff);\n"
    "    if (idx_out % 4 == 0) {\n"
    "        uint stencil_value = 0;\n"
    "        for (int i = 0; i < 4; i++) {\n" // Include next 3 pixels
    "            uint v = depth_stencil_in[get_input_idx(idx_out + i)] & 0xff;\n"
    "            stencil_value |= v << (i * 8);\n"
    "        }\n"
    "        stencil_out[idx_out / 4] = stencil_value;\n"
    "    }\n"
    "}\n";
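
/*
 * A #version directive and local_size_x declaration are prepended at
 * pipeline creation time, so one template serves any workgroup size.
 * The width_in/width_out push constants let the same shader handle
 * both 1:1 copies and scaled surfaces.
 */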
static gchar *get_compute_shader_glsl(VkFormat host_fmt, bool pack,
                                      int workgroup_size)
{
    const char *template = NULL;

    switch (host_fmt) {
    case VK_FORMAT_D24_UNORM_S8_UINT:
        template = pack ? pack_d24_unorm_s8_uint_to_z24s8_glsl :
                          unpack_z24s8_to_d24_unorm_s8_uint_glsl;
        break;
    case VK_FORMAT_D32_SFLOAT_S8_UINT:
        template = pack ? pack_d32_sfloat_s8_uint_to_z24s8_glsl :
                          unpack_z24s8_to_d32_sfloat_s8_uint_glsl;
        break;
    default:
        assert(!"Unsupported host fmt");
        break;
    }
    assert(template);

    gchar *glsl = g_strdup_printf(
        "#version 450\n"
        "layout(local_size_x = %d, local_size_y = 1, local_size_z = 1) in;\n"
        "%s", workgroup_size, template);
    assert(glsl);

    return glsl;
}
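
/*
 * Descriptor management: one pool holds a small fixed array of sets,
 * each with three storage-buffer bindings. Sets are written round-robin
 * via descriptor_set_index and recycled when the command buffer is
 * finished.
 */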
static void create_descriptor_pool(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    VkDescriptorPoolSize pool_sizes[] = {
        {
            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .descriptorCount = 3 * ARRAY_SIZE(r->compute.descriptor_sets),
        },
    };

    VkDescriptorPoolCreateInfo pool_info = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
        .poolSizeCount = ARRAY_SIZE(pool_sizes),
        .pPoolSizes = pool_sizes,
        .maxSets = ARRAY_SIZE(r->compute.descriptor_sets),
        .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
    };
    VK_CHECK(vkCreateDescriptorPool(r->device, &pool_info, NULL,
                                    &r->compute.descriptor_pool));
}

static void destroy_descriptor_pool(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    vkDestroyDescriptorPool(r->device, r->compute.descriptor_pool, NULL);
    r->compute.descriptor_pool = VK_NULL_HANDLE;
}

static void create_descriptor_set_layout(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    const int num_buffers = 3;

    VkDescriptorSetLayoutBinding bindings[num_buffers];
    for (int i = 0; i < num_buffers; i++) {
        bindings[i] = (VkDescriptorSetLayoutBinding){
            .binding = i,
            .descriptorCount = 1,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        };
    }
    VkDescriptorSetLayoutCreateInfo layout_info = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
        .bindingCount = ARRAY_SIZE(bindings),
        .pBindings = bindings,
    };
    VK_CHECK(vkCreateDescriptorSetLayout(r->device, &layout_info, NULL,
                                         &r->compute.descriptor_set_layout));
}

static void destroy_descriptor_set_layout(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    vkDestroyDescriptorSetLayout(r->device, r->compute.descriptor_set_layout,
                                 NULL);
    r->compute.descriptor_set_layout = VK_NULL_HANDLE;
}

static void create_descriptor_sets(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    VkDescriptorSetLayout layouts[ARRAY_SIZE(r->compute.descriptor_sets)];
    for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
        layouts[i] = r->compute.descriptor_set_layout;
    }
    VkDescriptorSetAllocateInfo alloc_info = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
        .descriptorPool = r->compute.descriptor_pool,
        .descriptorSetCount = ARRAY_SIZE(r->compute.descriptor_sets),
        .pSetLayouts = layouts,
    };
    VK_CHECK(vkAllocateDescriptorSets(r->device, &alloc_info,
                                      r->compute.descriptor_sets));
}

static void destroy_descriptor_sets(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    vkFreeDescriptorSets(r->device, r->compute.descriptor_pool,
                         ARRAY_SIZE(r->compute.descriptor_sets),
                         r->compute.descriptor_sets);
    for (int i = 0; i < ARRAY_SIZE(r->compute.descriptor_sets); i++) {
        r->compute.descriptor_sets[i] = VK_NULL_HANDLE;
    }
}
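
/*
 * All pack/unpack pipelines share one layout: a single descriptor set
 * of three storage buffers plus an 8-byte push constant block for
 * width_in and width_out.
 */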
static void create_compute_pipeline_layout(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    VkPushConstantRange push_constant_range = {
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .size = 2 * sizeof(uint32_t),
    };
    VkPipelineLayoutCreateInfo pipeline_layout_info = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        .setLayoutCount = 1,
        .pSetLayouts = &r->compute.descriptor_set_layout,
        .pushConstantRangeCount = 1,
        .pPushConstantRanges = &push_constant_range,
    };
    VK_CHECK(vkCreatePipelineLayout(r->device, &pipeline_layout_info, NULL,
                                    &r->compute.pipeline_layout));
}

static void destroy_compute_pipeline_layout(PGRAPHVkState *r)
{
    vkDestroyPipelineLayout(r->device, r->compute.pipeline_layout, NULL);
    r->compute.pipeline_layout = VK_NULL_HANDLE;
}
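
/*
 * Compile the generated GLSL into a compute pipeline. The intermediate
 * shader module is only needed during pipeline creation and is
 * destroyed before returning.
 */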
static VkPipeline create_compute_pipeline(PGRAPHVkState *r, const char *glsl)
{
    ShaderModuleInfo *module = pgraph_vk_create_shader_module_from_glsl(
        r, VK_SHADER_STAGE_COMPUTE_BIT, glsl);

    VkComputePipelineCreateInfo pipeline_info = {
        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
        .layout = r->compute.pipeline_layout,
        .stage =
            (VkPipelineShaderStageCreateInfo){
                .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
                .stage = VK_SHADER_STAGE_COMPUTE_BIT,
                .pName = "main",
                .module = module->module,
            },
    };
    VkPipeline pipeline;
    VK_CHECK(vkCreateComputePipelines(r->device, r->vk_pipeline_cache, 1,
                                      &pipeline_info, NULL, &pipeline));

    pgraph_vk_destroy_shader_module(r, module);

    return pipeline;
}

static void update_descriptor_sets(PGRAPHState *pg,
                                   VkDescriptorBufferInfo *buffers, int count)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    assert(count == 3);
    VkWriteDescriptorSet descriptor_writes[3];

    assert(r->compute.descriptor_set_index <
           ARRAY_SIZE(r->compute.descriptor_sets));

    for (int i = 0; i < count; i++) {
        descriptor_writes[i] = (VkWriteDescriptorSet){
            .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
            .dstSet =
                r->compute.descriptor_sets[r->compute.descriptor_set_index],
            .dstBinding = i,
            .dstArrayElement = 0,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .descriptorCount = 1,
            .pBufferInfo = &buffers[i],
        };
    }
    vkUpdateDescriptorSets(r->device, count, descriptor_writes, 0, NULL);

    r->compute.descriptor_set_index += 1;
}
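
/*
 * Once every preallocated descriptor set has been written, the current
 * command buffer must be finished so descriptor_set_index can be reset
 * and the sets reused.
 */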
bool pgraph_vk_compute_needs_finish(PGRAPHVkState *r)
{
    bool need_descriptor_write_reset =
        (r->compute.descriptor_set_index >=
         ARRAY_SIZE(r->compute.descriptor_sets));

    return need_descriptor_write_reset;
}

void pgraph_vk_compute_finish_complete(PGRAPHVkState *r)
{
    r->compute.descriptor_set_index = 0;
}

static int get_workgroup_size_for_output_units(PGRAPHVkState *r,
                                               int output_units)
{
    int group_size = 1024;

    // FIXME: Smarter workgroup size calculation could factor in multiple
    //        submissions. For now we just pick the largest power of two that
    //        respects the device limit and evenly divides output_units.

    while (group_size > 1) {
        if (group_size > r->device_props.limits.maxComputeWorkGroupSize[0]) {
            // Halve before continuing, or a size over the device limit
            // would loop forever.
            group_size /= 2;
            continue;
        }
        if (output_units % group_size == 0) {
            break;
        }
        group_size /= 2;
    }

    return group_size;
}
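
/*
 * For example, a 640x480 surface has 307200 output texels; 307200 is
 * divisible by 1024, so (device limits permitting) each dispatch runs
 * 300 workgroups of 1024 invocations.
 *
 * Pipelines are cached in an LRU keyed on (host_fmt, pack,
 * workgroup_size). The key is zeroed before being filled so that any
 * padding bytes compare equal in pipeline_cache_entry_compare(). A
 * cache miss calls pipeline_cache_entry_init() to generate the GLSL
 * and compile a new pipeline.
 */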
static ComputePipeline *get_compute_pipeline(PGRAPHVkState *r,
                                             VkFormat host_fmt, bool pack,
                                             int output_units)
{
    int workgroup_size = get_workgroup_size_for_output_units(r, output_units);

    ComputePipelineKey key;
    memset(&key, 0, sizeof(key));

    key.host_fmt = host_fmt;
    key.pack = pack;
    key.workgroup_size = workgroup_size;

    LruNode *node = lru_lookup(&r->compute.pipeline_cache,
                               fast_hash((void *)&key, sizeof(key)), &key);
    ComputePipeline *pipeline = container_of(node, ComputePipeline, node);

    assert(pipeline);

    return pipeline;
}

//
// Pack depth+stencil into a NV097_SET_SURFACE_FORMAT_ZETA_Z24S8
// formatted buffer with depth in bits 31-8 and stencil in bits 7-0.
//
void pgraph_vk_pack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
                                  VkCommandBuffer cmd, VkBuffer src,
                                  VkBuffer dst, bool downscale)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    unsigned int input_width = surface->width, input_height = surface->height;
    pgraph_apply_scaling_factor(pg, &input_width, &input_height);

    unsigned int output_width = surface->width,
                 output_height = surface->height;
    if (!downscale) {
        pgraph_apply_scaling_factor(pg, &output_width, &output_height);
    }

    size_t depth_bytes_per_pixel = 4;
    size_t depth_size = input_width * input_height * depth_bytes_per_pixel;

    size_t stencil_bytes_per_pixel = 1;
    size_t stencil_size = input_width * input_height * stencil_bytes_per_pixel;

    size_t output_bytes_per_pixel = 4;
    size_t output_size = output_width * output_height * output_bytes_per_pixel;

    // Depth and stencil planes share `src`: depth at offset 0, stencil at
    // the next storage-buffer-aligned offset after the depth plane.
    VkDescriptorBufferInfo buffers[] = {
        {
            .buffer = src,
            .offset = 0,
            .range = depth_size,
        },
        {
            .buffer = src,
            .offset = ROUND_UP(
                depth_size,
                r->device_props.limits.minStorageBufferOffsetAlignment),
            .range = stencil_size,
        },
        {
            .buffer = dst,
            .offset = 0,
            .range = output_size,
        },
    };

    update_descriptor_sets(pg, buffers, ARRAY_SIZE(buffers));

    size_t output_size_in_units = output_width * output_height;
    ComputePipeline *pipeline = get_compute_pipeline(
        r, surface->host_fmt.vk_format, true, output_size_in_units);

    size_t workgroup_size_in_units = pipeline->key.workgroup_size;
    assert(output_size_in_units % workgroup_size_in_units == 0);
    size_t group_count = output_size_in_units / workgroup_size_in_units;

    assert(r->device_props.limits.maxComputeWorkGroupSize[0] >=
           workgroup_size_in_units);
    assert(r->device_props.limits.maxComputeWorkGroupCount[0] >= group_count);

    // FIXME: Smarter workgroup scaling

    pgraph_vk_begin_debug_marker(r, cmd, RGBA_PINK, __func__);
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline);
    vkCmdBindDescriptorSets(
        cmd, VK_PIPELINE_BIND_POINT_COMPUTE, r->compute.pipeline_layout, 0, 1,
        &r->compute.descriptor_sets[r->compute.descriptor_set_index - 1], 0,
        NULL);

    uint32_t push_constants[2] = { input_width, output_width };
    assert(sizeof(push_constants) == 8);
    vkCmdPushConstants(cmd, r->compute.pipeline_layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
                       push_constants);

    // FIXME: Check max group count

    vkCmdDispatch(cmd, group_count, 1, 1);
    pgraph_vk_end_debug_marker(r, cmd);
}
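
/*
 * Unpack is the inverse path: the guest Z24S8 buffer (always at native
 * guest resolution) is expanded into separate host depth and stencil
 * planes at the scaled host resolution.
 */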
void pgraph_vk_unpack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
                                    VkCommandBuffer cmd, VkBuffer src,
                                    VkBuffer dst)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    unsigned int input_width = surface->width, input_height = surface->height;

    unsigned int output_width = surface->width,
                 output_height = surface->height;
    pgraph_apply_scaling_factor(pg, &output_width, &output_height);

    size_t depth_bytes_per_pixel = 4;
    size_t depth_size = output_width * output_height * depth_bytes_per_pixel;

    size_t stencil_bytes_per_pixel = 1;
    size_t stencil_size =
        output_width * output_height * stencil_bytes_per_pixel;

    size_t input_bytes_per_pixel = 4;
    size_t input_size = input_width * input_height * input_bytes_per_pixel;

    // Depth and stencil planes share `dst`: depth at offset 0, stencil at
    // the next storage-buffer-aligned offset after the depth plane.
    VkDescriptorBufferInfo buffers[] = {
        {
            .buffer = dst,
            .offset = 0,
            .range = depth_size,
        },
        {
            .buffer = dst,
            .offset = ROUND_UP(
                depth_size,
                r->device_props.limits.minStorageBufferOffsetAlignment),
            .range = stencil_size,
        },
        {
            .buffer = src,
            .offset = 0,
            .range = input_size,
        },
    };
    update_descriptor_sets(pg, buffers, ARRAY_SIZE(buffers));

    size_t output_size_in_units = output_width * output_height;
    ComputePipeline *pipeline = get_compute_pipeline(
        r, surface->host_fmt.vk_format, false, output_size_in_units);

    size_t workgroup_size_in_units = pipeline->key.workgroup_size;
    assert(output_size_in_units % workgroup_size_in_units == 0);
    size_t group_count = output_size_in_units / workgroup_size_in_units;

    assert(r->device_props.limits.maxComputeWorkGroupSize[0] >=
           workgroup_size_in_units);
    assert(r->device_props.limits.maxComputeWorkGroupCount[0] >= group_count);

    // FIXME: Smarter workgroup scaling

    pgraph_vk_begin_debug_marker(r, cmd, RGBA_PINK, __func__);
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline);
    vkCmdBindDescriptorSets(
        cmd, VK_PIPELINE_BIND_POINT_COMPUTE, r->compute.pipeline_layout, 0, 1,
        &r->compute.descriptor_sets[r->compute.descriptor_set_index - 1], 0,
        NULL);

    assert(output_width >= input_width);
    uint32_t push_constants[2] = { input_width, output_width };
    assert(sizeof(push_constants) == 8);
    vkCmdPushConstants(cmd, r->compute.pipeline_layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
                       push_constants);
    vkCmdDispatch(cmd, group_count, 1, 1);
    pgraph_vk_end_debug_marker(r, cmd);
}
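
/*
 * LRU cache callbacks: init_node compiles a pipeline for a new key,
 * post_node_evict destroys it, and compare_nodes does a bytewise key
 * match (valid because keys are zero-initialized before filling).
 */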
static void pipeline_cache_entry_init(Lru *lru, LruNode *node,
                                      const void *state)
{
    PGRAPHVkState *r =
        container_of(lru, PGRAPHVkState, compute.pipeline_cache);
    ComputePipeline *snode = container_of(node, ComputePipeline, node);

    memcpy(&snode->key, state, sizeof(snode->key));

    if (snode->key.workgroup_size == 1) {
        fprintf(stderr,
                "Warning: Needed compute shader with workgroup size = 1\n");
    }

    gchar *glsl = get_compute_shader_glsl(
        snode->key.host_fmt, snode->key.pack, snode->key.workgroup_size);
    assert(glsl);
    snode->pipeline = create_compute_pipeline(r, glsl);
    g_free(glsl);
}

static void pipeline_cache_release_node_resources(PGRAPHVkState *r,
                                                  ComputePipeline *snode)
{
    vkDestroyPipeline(r->device, snode->pipeline, NULL);
    snode->pipeline = VK_NULL_HANDLE;
}

static void pipeline_cache_entry_post_evict(Lru *lru, LruNode *node)
{
    PGRAPHVkState *r =
        container_of(lru, PGRAPHVkState, compute.pipeline_cache);
    ComputePipeline *snode = container_of(node, ComputePipeline, node);
    pipeline_cache_release_node_resources(r, snode);
}

static bool pipeline_cache_entry_compare(Lru *lru, LruNode *node,
                                         const void *key)
{
    ComputePipeline *snode = container_of(node, ComputePipeline, node);
    return memcmp(&snode->key, key, sizeof(ComputePipelineKey));
}

static void pipeline_cache_init(PGRAPHVkState *r)
{
    const size_t pipeline_cache_size = 100; // FIXME: Trim
    lru_init(&r->compute.pipeline_cache);
    r->compute.pipeline_cache_entries =
        g_malloc_n(pipeline_cache_size, sizeof(ComputePipeline));
    assert(r->compute.pipeline_cache_entries != NULL);
    for (int i = 0; i < pipeline_cache_size; i++) {
        lru_add_free(&r->compute.pipeline_cache,
                     &r->compute.pipeline_cache_entries[i].node);
    }
    r->compute.pipeline_cache.init_node = pipeline_cache_entry_init;
    r->compute.pipeline_cache.compare_nodes = pipeline_cache_entry_compare;
    r->compute.pipeline_cache.post_node_evict = pipeline_cache_entry_post_evict;
}

static void pipeline_cache_finalize(PGRAPHVkState *r)
{
    lru_flush(&r->compute.pipeline_cache);
    g_free(r->compute.pipeline_cache_entries);
    r->compute.pipeline_cache_entries = NULL;
}

void pgraph_vk_init_compute(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    create_descriptor_pool(pg);
    create_descriptor_set_layout(pg);
    create_descriptor_sets(pg);
    create_compute_pipeline_layout(pg);
    pipeline_cache_init(r);
}

void pgraph_vk_finalize_compute(PGRAPHState *pg)
{
    PGRAPHVkState *r = pg->vk_renderer_state;

    assert(!r->in_command_buffer);

    pipeline_cache_finalize(r);
    destroy_compute_pipeline_layout(r);
    destroy_descriptor_sets(pg);
    destroy_descriptor_set_layout(pg);
    destroy_descriptor_pool(pg);
}