diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h
new file mode 100644
index 0000000000..8764f7f24a
--- /dev/null
+++ b/rpcs3/Emu/RSX/GL/GLCompute.h
@@ -0,0 +1,426 @@
+#pragma once
+
+#include "Utilities/StrUtil.h"
+#include "GLHelpers.h"
+
+namespace gl
+{
+	// Base class for a compute job: owns one compute shader and the program it is linked into.
+	struct compute_task
+	{
+		std::string m_src;
+		gl::glsl::shader m_shader;
+		gl::glsl::program m_program;
+		bool compiled = false;
+
+		// Device-specific options
+		bool unroll_loops = true;
+		u32 optimal_group_size = 1;
+		u32 optimal_kernel_size = 1;
+
+		// Tasks are owned and deleted through compute_task pointers (see g_compute_tasks),
+		// so the destructor must be virtual. It does not call destroy().
+		virtual ~compute_task() = default;
+
+		// Compiles m_src as a compute shader and links it into m_program. No-op once compiled.
+		void create()
+		{
+			if (compiled)
+			{
+				return;
+			}
+
+			m_shader.create(gl::glsl::shader::type::compute);
+			m_shader.source(m_src);
+			m_shader.compile();
+
+			m_program.create();
+			m_program.attach(m_shader);
+			m_program.make();
+
+			compiled = true;
+		}
+
+		// Releases the program and shader created by create(). No-op if nothing was compiled.
+		void destroy()
+		{
+			if (!compiled)
+			{
+				return;
+			}
+
+			m_program.remove();
+			m_shader.remove();
+
+			compiled = false;
+		}
+
+		// Hook for derived tasks to bind their buffers; invoked by run() before every dispatch.
+		virtual void bind_resources()
+		{}
+
+		// Dispatches invocations_x * invocations_y * 1 work groups, then restores the previously bound program.
+		void run(u32 invocations_x, u32 invocations_y)
+		{
+			GLint old_program = 0;
+			glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);
+
+			// Without this call the bindings set up by derived tasks never reach the shader
+			bind_resources();
+
+			m_program.use();
+			glDispatchCompute(invocations_x, invocations_y, 1);
+
+			glUseProgram(static_cast<GLuint>(old_program));
+		}
+
+		// One-dimensional dispatch helper
+		void run(u32 num_invocations)
+		{
+			run(num_invocations, 1);
+		}
+	};
+
+	// Generic in-place transform over a buffer of 32-bit words. The GLSL source is assembled by build()
+	// from a common prologue plus the per-task work_kernel.
+	struct cs_shuffle_base : compute_task
+	{
+		const gl::buffer* m_data = nullptr;
+		u32 m_data_offset = 0;
+		u32 m_data_length = 0;
+		u32 kernel_size = 1;
+
+		std::string uniforms, variables, work_kernel, loop_advance, suffix;
+
+		cs_shuffle_base()
+		{
+			work_kernel =
+				" value = data[index];\n"
+				" data[index] = %f(value);\n";
+
+			loop_advance =
+				" index++;\n";
+
+			suffix =
+				"}\n";
+		}
+
+		// Assembles m_src. function_name replaces %f in the work kernel; _kernel_size == 0 selects
+		// optimal_kernel_size. Compilation is deferred to create().
+		void build(const char* function_name, u32 _kernel_size = 0)
+		{
+			// Do not call create() here: m_src is still empty at this point and create() latches
+			// 'compiled', so the shader assembled below would never be compiled.
+			kernel_size = _kernel_size ? _kernel_size : optimal_kernel_size;
+
+			m_src =
+				"#version 430\n"
+				"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
+				"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
+				"%ub"
+				"\n"
+				"#define KERNEL_SIZE %ks\n"
+				"\n"
+				"// Generic swap routines\n"
+				"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
+				"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
+				"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
+				"\n"
+				"// Depth format conversions\n"
+				"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
+				"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
+				"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
+				"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
+				"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
+				"\n"
+				"void main()\n"
+				"{\n"
+				" uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
+				" uint value;\n"
+				" %vars"
+				"\n";
+
+			const std::pair<std::string, std::string> syntax_replace[] =
+			{
+				{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
+				{ "%ws", std::to_string(optimal_group_size) },
+				{ "%ks", std::to_string(kernel_size) },
+				{ "%vars", variables },
+				{ "%f", function_name },
+				{ "%ub", uniforms },
+			};
+
+			m_src = fmt::replace_all(m_src, syntax_replace);
+			work_kernel = fmt::replace_all(work_kernel, syntax_replace);
+
+			if (kernel_size <= 1)
+			{
+				m_src += " {\n" + work_kernel + " }\n";
+			}
+			else if (unroll_loops)
+			{
+				work_kernel += loop_advance + "\n";
+
+				m_src += std::string
+				(
+					" //Unrolled loop\n"
+					" {\n"
+				);
+
+				// Assemble body with manual loop unroll to try lowering GPR usage
+				for (u32 n = 0; n < kernel_size; ++n)
+				{
+					m_src += work_kernel;
+				}
+
+				m_src += " }\n";
+			}
+			else
+			{
+				m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
+				m_src += " {\n";
+				m_src += work_kernel;
+				m_src += loop_advance;
+				m_src += " }\n";
+			}
+
+			m_src += suffix;
+		}
+
+		// Binds m_data_length bytes of m_data, starting at m_data_offset, to the compute buffer slot
+		void bind_resources() override
+		{
+			m_data->bind_range(GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
+		}
+
+		// Processes data_length bytes of 'data' starting at data_offset. The length is aligned (via align())
+		// to a whole number of work groups; an error is logged if that exceeds the buffer size.
+		void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0)
+		{
+			m_data = data;
+			m_data_offset = data_offset;
+			m_data_length = data_length;
+
+			const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
+			const auto num_bytes_to_process = align(data_length, num_bytes_per_invocation);
+			const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
+
+			if ((num_bytes_to_process + data_offset) > data->size())
+			{
+				// Technically robust buffer access should keep the driver from crashing in OOB situations
+				LOG_ERROR(RSX, "Inadequate buffer length submitted for a compute operation. "
+					"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
+			}
+
+			compute_task::run(num_invocations);
+		}
+	};
+
+	struct cs_shuffle_16 : cs_shuffle_base
+	{
+		// byteswap ushort
+		cs_shuffle_16()
+		{
+			cs_shuffle_base::build("bswap_u16");
+		}
+	};
+
+	struct cs_shuffle_32 : cs_shuffle_base
+	{
+		// byteswap_ulong
+		cs_shuffle_32()
+		{
+			cs_shuffle_base::build("bswap_u32");
+		}
+	};
+
+	struct cs_shuffle_32_16 : cs_shuffle_base
+	{
+		// byteswap_ulong + byteswap_ushort
+		cs_shuffle_32_16()
+		{
+			cs_shuffle_base::build("bswap_u16_u32");
+		}
+	};
+
+	struct cs_shuffle_d24x8_f32 : cs_shuffle_base
+	{
+		// convert d24x8 to f32
+		cs_shuffle_d24x8_f32()
+		{
+			cs_shuffle_base::build("d24x8_to_f32");
+		}
+	};
+
+	struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base
+	{
+		// convert f32 to d24x8 and swap endianness
+		cs_shuffle_se_f32_d24x8()
+		{
+			cs_shuffle_base::build("f32_to_d24x8_swapped");
+		}
+	};
+
+	struct cs_shuffle_se_d24x8 : cs_shuffle_base
+	{
+		// swap endianness of d24x8
+		cs_shuffle_se_d24x8()
+		{
+			cs_shuffle_base::build("d24x8_to_d24x8_swapped");
+		}
+	};
+
+	// NOTE: D24S8 layout has the stencil in the MSB! It's actually S8|D24|S8|D24 starting at offset 0
+	struct cs_interleave_task : cs_shuffle_base
+	{
+		cs_interleave_task()
+		{
+			uniforms =
+				" uniform uint block_length;\n"
+				" uniform uint z_offset;\n"
+				" uniform uint s_offset;\n";
+
+			variables =
+				" uint depth;\n"
+				" uint stencil;\n"
+				" uint stencil_shift;\n"
+				" uint stencil_offset;\n";
+		}
+
+		// Uploads the block length and the depth/stencil offsets (relative to data_offset), then dispatches.
+		// Mind the argument order: (offset, length) here versus (length, offset) in cs_shuffle_base::run.
+		// NOTE(review): the kernels use these uniforms as indices into uint data[] - confirm callers pass matching units
+		void run(const gl::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
+		{
+			m_program.uniforms["block_length"] = data_length;
+			m_program.uniforms["z_offset"] = zeta_offset - data_offset;
+			m_program.uniforms["s_offset"] = stencil_offset - data_offset;
+			cs_shuffle_base::run(data, data_length, data_offset);
+		}
+	};
+
+	// Writes (depth << 8) | stencil per index: 24-bit depth read at z_offset, 8-bit stencil read at s_offset
+	template<bool _SwapBytes = false>
+	struct cs_gather_d24x8 : cs_interleave_task
+	{
+		cs_gather_d24x8()
+		{
+			work_kernel =
+				" if (index >= block_length)\n"
+				" return;\n"
+				"\n"
+				" depth = data[index + z_offset] & 0x00FFFFFF;\n"
+				" stencil_offset = (index / 4);\n"
+				" stencil_shift = (index % 4) * 8;\n"
+				" stencil = data[stencil_offset + s_offset];\n"
+				" stencil = (stencil >> stencil_shift) & 0xFF;\n"
+				" value = (depth << 8) | stencil;\n";
+
+			if constexpr (!_SwapBytes)
+			{
+				work_kernel +=
+					" data[index] = value;\n";
+			}
+			else
+			{
+				work_kernel +=
+					" data[index] = bswap_u32(value);\n";
+			}
+
+			cs_shuffle_base::build("");
+		}
+	};
+
+	// Same as cs_gather_d24x8, but the depth word is first converted with f32_to_d24
+	template<bool _SwapBytes = false>
+	struct cs_gather_d32x8 : cs_interleave_task
+	{
+		cs_gather_d32x8()
+		{
+			work_kernel =
+				" if (index >= block_length)\n"
+				" return;\n"
+				"\n"
+				" depth = f32_to_d24(data[index + z_offset]);\n"
+				" stencil_offset = (index / 4);\n"
+				" stencil_shift = (index % 4) * 8;\n"
+				" stencil = data[stencil_offset + s_offset];\n"
+				" stencil = (stencil >> stencil_shift) & 0xFF;\n"
+				" value = (depth << 8) | stencil;\n";
+
+			if constexpr (!_SwapBytes)
+			{
+				work_kernel +=
+					" data[index] = value;\n";
+			}
+			else
+			{
+				work_kernel +=
+					" data[index] = bswap_u32(value);\n";
+			}
+
+			cs_shuffle_base::build("");
+		}
+	};
+
+	// Splits each packed word: the upper 24 bits go to z_offset, the low byte is merged into the word at s_offset
+	struct cs_scatter_d24x8 : cs_interleave_task
+	{
+		cs_scatter_d24x8()
+		{
+			work_kernel =
+				" if (index >= block_length)\n"
+				" return;\n"
+				"\n"
+				" value = data[index];\n"
+				" data[index + z_offset] = (value >> 8);\n"
+				" stencil_offset = (index / 4);\n"
+				" stencil_shift = (index % 4) * 8;\n"
+				" stencil = (value & 0xFF) << stencil_shift;\n"
+				// Four consecutive indices share one stencil word, so the merge must be atomic across invocations
+				" atomicOr(data[stencil_offset + s_offset], stencil);\n";
+
+			cs_shuffle_base::build("");
+		}
+	};
+
+	// Same as cs_scatter_d24x8, but the depth value is written through d24_to_f32
+	struct cs_scatter_d32x8 : cs_interleave_task
+	{
+		cs_scatter_d32x8()
+		{
+			work_kernel =
+				" if (index >= block_length)\n"
+				" return;\n"
+				"\n"
+				" value = data[index];\n"
+				" data[index + z_offset] = d24_to_f32(value >> 8);\n"
+				" stencil_offset = (index / 4);\n"
+				" stencil_shift = (index % 4) * 8;\n"
+				" stencil = (value & 0xFF) << stencil_shift;\n"
+				// Four consecutive indices share one stencil word, so the merge must be atomic across invocations
+				" atomicOr(data[stencil_offset + s_offset], stencil);\n";
+
+			cs_shuffle_base::build("");
+		}
+	};
+
+	// TODO: Replace with a proper manager
+	extern std::unordered_map<u32, std::unique_ptr<gl::compute_task>> g_compute_tasks;
+
+	// Returns the shared instance of task type T, constructing and compiling it on first use
+	template<class T>
+	T* get_compute_task()
+	{
+		u32 index = id_manager::typeinfo::get_index<T>();
+		auto &e = g_compute_tasks[index];
+
+		if (!e)
+		{
+			e = std::make_unique<T>();
+			e->create();
+		}
+
+		return static_cast<T*>(e.get());
+	}
+}
\ No newline at end of file
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index 411120903c..ad73f5d029 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -1334,7 +1334,7 @@ void GLGSRender::load_program_env()
 		*(reinterpret_cast(buf + 136)) = rsx::method_registers.clip_min();
 		*(reinterpret_cast(buf + 140)) = rsx::method_registers.clip_max();
 
-		m_vertex_env_buffer->bind_range(0, mapping.second, 144);
+		m_vertex_env_buffer->bind_range(GL_VERTEX_PARAMS_BIND_SLOT, mapping.second, 144);
 	}
 
 	if (update_transform_constants)
@@ -1344,7 +1344,7 @@ void GLGSRender::load_program_env()
 		auto buf = static_cast(mapping.first);
 		fill_vertex_program_constants_data(buf);
 
-		m_transform_constants_buffer->bind_range(2, mapping.second, 8192);
+		m_transform_constants_buffer->bind_range(GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT, mapping.second, 8192);
 	}
 
 	if (update_fragment_constants)
@@ -1356,7 +1356,7 @@ void GLGSRender::load_program_env()
 		m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast(buf), gsl::narrow(fragment_constants_size) },
 			current_fragment_program, gl::get_driver_caps().vendor_NVIDIA);
 
-		m_fragment_constants_buffer->bind_range(3, mapping.second, fragment_constants_size);
+		m_fragment_constants_buffer->bind_range(GL_FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, mapping.second, fragment_constants_size);
 	}
 
 	if (update_fragment_env)
@@ -1366,7 +1366,7 @@ void GLGSRender::load_program_env()
 		auto buf = static_cast(mapping.first);
 		fill_fragment_state_buffer(buf, current_fragment_program);
 
-		m_fragment_env_buffer->bind_range(4, mapping.second, 32);
+		m_fragment_env_buffer->bind_range(GL_FRAGMENT_STATE_BIND_SLOT, mapping.second, 32);
 	}
 
 	if (update_fragment_texture_env)
@@ -1376,7 +1376,7 @@ void GLGSRender::load_program_env()
 		auto buf = static_cast(mapping.first);
 		fill_fragment_texture_parameters(buf, current_fragment_program);
 
-		m_texture_parameters_buffer->bind_range(5, mapping.second, 256);
+		m_texture_parameters_buffer->bind_range(GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT, mapping.second, 256);
 	}
 
 	if (manually_flush_ring_buffers)
@@ -1409,7 +1409,7 @@ void GLGSRender::update_vertex_env(const gl::vertex_upload_info& upload_info)
 	fill_vertex_layout_state(m_vertex_layout, upload_info.first_vertex, upload_info.allocated_vertex_count, (s32*)buf,
 		upload_info.persistent_mapping_offset, upload_info.volatile_mapping_offset);
 
-	m_vertex_layout_buffer->bind_range(1, mapping.second, 128 + 16);
+	m_vertex_layout_buffer->bind_range(GL_VERTEX_LAYOUT_BIND_SLOT, mapping.second, 128 + 16);
 
 	if (manually_flush_ring_buffers)
 	{
diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.cpp b/rpcs3/Emu/RSX/GL/GLHelpers.cpp
index 80f847c0b2..c45660ae35 100644
--- a/rpcs3/Emu/RSX/GL/GLHelpers.cpp
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.cpp
@@ -1,10 +1,12 @@
 #include "stdafx.h"
 #include "GLHelpers.h"
 #include "GLTexture.h"
+#include "GLCompute.h"
 #include "Utilities/Log.h"
 
 namespace gl
 {
+	std::unordered_map<u32, std::unique_ptr<gl::compute_task>> g_compute_tasks;
 	blitter *g_hw_blitter = nullptr;
 	capabilities g_driver_caps;
 	const fbo screen{};
diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h
index e89e7b662b..ec7b25483b 100644
--- a/rpcs3/Emu/RSX/GL/GLHelpers.h
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.h
@@ -21,6 +21,14 @@
 #define GL_STENCIL_MIRRORS_START (GL_VERTEX_TEXTURES_START + 4)
 #define GL_STREAM_BUFFER_START (GL_STENCIL_MIRRORS_START + 16)
 
+#define GL_VERTEX_PARAMS_BIND_SLOT 0
+#define GL_VERTEX_LAYOUT_BIND_SLOT 1
+#define GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT 2
+#define GL_FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT 3
+#define GL_FRAGMENT_STATE_BIND_SLOT 4
+#define GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT 5
+#define GL_COMPUTE_BUFFER_SLOT(index) ((index) + 6)
+
 inline static void _SelectTexture(int unit) { glActiveTexture(GL_TEXTURE0 + unit); }
 
 namespace gl
@@ -904,6 +912,11 @@ namespace gl
 			verify(HERE), m_memory_type == memory_type::host_visible;
 			glUnmapBuffer((GLenum)current_target());
 		}
+
+		void bind_range(u32 index, u32 offset, u32 size) const
+		{
+			glBindBufferRange((GLenum)current_target(), index, id(), offset, size);
+		}
 	};
 
 	class ring_buffer : public buffer
@@ -991,11 +1004,6 @@ namespace gl
 
 		virtual void unmap() {}
 
-		void bind_range(u32 index, u32 offset, u32 size) const
-		{
-			glBindBufferRange((GLenum)current_target(), index, id(), offset, size);
-		}
-
 		//Notification of a draw command
 		virtual void notify()
 		{
@@ -2383,7 +2391,7 @@ public:
 			{
 				fragment = GL_FRAGMENT_SHADER,
 				vertex = GL_VERTEX_SHADER,
-				geometry = GL_GEOMETRY_SHADER
+				compute = GL_COMPUTE_SHADER
 			};
 
 		private:
@@ -2533,6 +2541,7 @@ public:
 				}
 
 				void operator = (int rhs) const { glProgramUniform1i(m_program.id(), location(), rhs); }
+				void operator = (unsigned rhs) const { glProgramUniform1ui(m_program.id(), location(), rhs); }
 				void operator = (float rhs) const { glProgramUniform1f(m_program.id(), location(), rhs); }
 				void operator = (const color1i& rhs) const { glProgramUniform1i(m_program.id(), location(), rhs.r); }
 				void operator = (const color1f& rhs) const { glProgramUniform1f(m_program.id(), location(), rhs.r); }