gl: Add a framework for compute shaders

2019-10-02 02:26:29 +03:00 · 2019-10-02 02:26:29 +03:00 · 7a6e2e716f
parent 9bcd7b534b
commit 7a6e2e716f
4 changed files with 416 additions and 12 deletions
--- a/rpcs3/Emu/RSX/GL/GLCompute.h
+++ b/rpcs3/Emu/RSX/GL/GLCompute.h
@ -0,0 +1,393 @@
+#pragma once
+
+#include "Utilities/StrUtil.h"
+#include "GLHelpers.h"
+
+namespace gl
+{
+    // Base class for a GLSL compute job. Owns one compute shader object and the program it is
+    // linked into, and dispatches that program. Derived classes fill in m_src and may override
+    // bind_resources() to attach the buffers their kernel reads and writes.
+    struct compute_task
+    {
+        std::string m_src;              // GLSL source handed to the shader object by create()
+        gl::glsl::shader m_shader;
+        gl::glsl::program m_program;
+        bool compiled = false;          // true between create() and the matching destroy()
+
+        // Device-specific options
+        bool unroll_loops = true;
+        u32 optimal_group_size = 1;
+        u32 optimal_kernel_size = 1;
+
+        // Tasks are stored and destroyed through std::unique_ptr<gl::compute_task>
+        // (see g_compute_tasks), so deleting a derived task requires a virtual destructor.
+        // GL objects are not released here; call destroy() for that.
+        virtual ~compute_task() = default;
+
+        // Compiles m_src as a compute shader and builds the program from it.
+        // Does nothing when the task has already been created.
+        void create()
+        {
+            if (!compiled)
+            {
+                m_shader.create(gl::glsl::shader::type::compute);
+                m_shader.source(m_src);
+                m_shader.compile();
+
+                m_program.create();
+                m_program.attach(m_shader);
+                m_program.make();
+
+                compiled = true;
+            }
+        }
+
+        // Releases the program and shader created by create(). Safe to call repeatedly.
+        void destroy()
+        {
+            if (compiled)
+            {
+                m_program.remove();
+                m_shader.remove();
+
+                compiled = false;
+            }
+        }
+
+        // Hook invoked by run() right before the dispatch; the default binds nothing.
+        virtual void bind_resources()
+        {}
+
+        // Dispatches invocations_x * invocations_y * 1 work groups of this program.
+        // The program that was current on entry is made current again afterwards.
+        void run(u32 invocations_x, u32 invocations_y)
+        {
+            GLint old_program = 0;
+            glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);
+
+            // Without this call the overrides in derived tasks would never execute and
+            // the kernel would run with whatever happened to be bound to its slots.
+            bind_resources();
+
+            m_program.use();
+            glDispatchCompute(invocations_x, invocations_y, 1);
+
+            glUseProgram(static_cast<GLuint>(old_program));
+        }
+
+        // One-dimensional convenience overload of run().
+        void run(u32 num_invocations)
+        {
+            run(num_invocations, 1);
+        }
+    };
+
+	// Base for kernels that rewrite a buffer of 32-bit words in place.
+	//
+	// build() assembles the GLSL text into m_src from a fixed prologue plus the customisable
+	// fragments declared below. Every shader invocation starts at word
+	// gl_GlobalInvocationID.x * KERNEL_SIZE and handles kernel_size words.
+	// run() records the buffer range to operate on and dispatches the work groups.
+	struct cs_shuffle_base : compute_task
+	{
+		const gl::buffer* m_data = nullptr; // Buffer bound by bind_resources(); set by run()
+		u32 m_data_offset = 0;              // Range start forwarded to bind_range()
+		u32 m_data_length = 0;              // Range size forwarded to bind_range()
+		u32 kernel_size = 1;                // Words handled per invocation; chosen in build()
+
+		// Shader fragments. Derived classes may overwrite them before calling build().
+		std::string uniforms, variables, work_kernel, loop_advance, suffix;
+
+		// Installs the default fragments: apply %f to one word, then step to the next word.
+		cs_shuffle_base()
+		{
+			work_kernel =
+				"		value = data[index];\n"
+				"		data[index] = %f(value);\n";
+
+			loop_advance =
+				"		index++;\n";
+
+			suffix =
+				"}\n";
+		}
+
+		// Generates the shader source into m_src.
+		//   function_name  text substituted for %f in the work kernel
+		//   _kernel_size   words per invocation; 0 selects optimal_kernel_size
+		//
+		// The shader is NOT compiled here. create() compiles whatever m_src holds and then
+		// refuses to run again, so calling it before the source exists would compile an empty
+		// shader and turn the later create() issued by get_compute_task() into a no-op.
+		void build(const char* function_name, u32 _kernel_size = 0)
+		{
+			kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
+
+			m_src =
+				"#version 430\n"
+				"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
+				"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
+				"%ub"
+				"\n"
+				"#define KERNEL_SIZE %ks\n"
+				"\n"
+				"// Generic swap routines\n"
+				"#define bswap_u16(bits)     (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
+				"#define bswap_u32(bits)     (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
+				"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
+				"\n"
+				"// Depth format conversions\n"
+				"#define d24_to_f32(bits)             floatBitsToUint(float(bits) / 16777215.f)\n"
+				"#define f32_to_d24(bits)             uint(uintBitsToFloat(bits) * 16777215.f)\n"
+				"#define d24x8_to_f32(bits)           d24_to_f32(bits >> 8)\n"
+				"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
+				"#define f32_to_d24x8_swapped(bits)   d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
+				"\n"
+				"void main()\n"
+				"{\n"
+				"	uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
+				"	uint value;\n"
+				"	%vars"
+				"\n";
+
+			// Placeholder -> replacement text, applied to both the prologue and the work kernel
+			const std::pair<std::string, std::string> syntax_replace[] =
+			{
+				{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
+				{ "%ws", std::to_string(optimal_group_size) },
+				{ "%ks", std::to_string(kernel_size) },
+				{ "%vars", variables },
+				{ "%f", function_name },
+				{ "%ub", uniforms },
+			};
+
+			m_src = fmt::replace_all(m_src, syntax_replace);
+			work_kernel = fmt::replace_all(work_kernel, syntax_replace);
+
+			if (kernel_size <= 1)
+			{
+				// Single word per invocation: no loop and no index advance needed
+				m_src += "	{\n" + work_kernel + "	}\n";
+			}
+			else if (unroll_loops)
+			{
+				work_kernel += loop_advance + "\n";
+
+				m_src += std::string
+				(
+					"	//Unrolled loop\n"
+					"	{\n"
+				);
+
+				// Assemble body with manual loop unroll to try lowering GPR usage
+				for (u32 n = 0; n < kernel_size; ++n)
+				{
+					m_src += work_kernel;
+				}
+
+				m_src += "	}\n";
+			}
+			else
+			{
+				m_src += "	for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
+				m_src += "	{\n";
+				m_src += work_kernel;
+				m_src += loop_advance;
+				m_src += "	}\n";
+			}
+
+			m_src += suffix;
+		}
+
+		// Binds the range selected by the last run() call to the first compute buffer slot.
+		void bind_resources() override
+		{
+			m_data->bind_range(GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
+		}
+
+		// Processes data_length bytes of `data` starting at data_offset.
+		// The length is rounded up to a whole number of work groups; if the rounded range
+		// does not fit inside the buffer an error is logged and the dispatch still happens.
+		void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0)
+		{
+			m_data = data;
+			m_data_offset = data_offset;
+			m_data_length = data_length;
+
+			// One work group covers group_size invocations * kernel_size words * 4 bytes per word
+			const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
+			const auto num_bytes_to_process = align(data_length, num_bytes_per_invocation);
+			const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
+
+			if ((num_bytes_to_process + data_offset) > data->size())
+			{
+				// Technically robust buffer access should keep the driver from crashing in OOB situations
+				LOG_ERROR(RSX, "Inadequate buffer length submitted for a compute operation. "
+					"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
+			}
+
+			compute_task::run(num_invocations);
+		}
+	};
+
+	// Shuffle task whose kernel applies bswap_u16 to every word,
+	// i.e. swaps the two bytes inside each 16-bit half.
+	struct cs_shuffle_16 : cs_shuffle_base
+	{
+		cs_shuffle_16()
+		{
+			build("bswap_u16");
+		}
+	};
+
+	// Shuffle task whose kernel applies bswap_u32 to every word,
+	// i.e. reverses the order of its four bytes.
+	struct cs_shuffle_32 : cs_shuffle_base
+	{
+		cs_shuffle_32()
+		{
+			build("bswap_u32");
+		}
+	};
+
+	// Shuffle task whose kernel applies bswap_u16_u32 to every word,
+	// i.e. exchanges the upper and lower 16-bit halves (32-bit swap followed by 16-bit swap).
+	struct cs_shuffle_32_16 : cs_shuffle_base
+	{
+		cs_shuffle_32_16()
+		{
+			build("bswap_u16_u32");
+		}
+	};
+
+	// Shuffle task that converts d24x8 words to f32: the kernel applies d24x8_to_f32,
+	// which takes the upper 24 bits, divides by 16777215.f and stores the float's bit pattern.
+	struct cs_shuffle_d24x8_f32 : cs_shuffle_base
+	{
+		cs_shuffle_d24x8_f32()
+		{
+			build("d24x8_to_f32");
+		}
+	};
+
+	// Shuffle task that converts f32 words to d24x8 with swapped endianness:
+	// the kernel applies f32_to_d24x8_swapped (f32_to_d24 followed by d24x8_to_d24x8_swapped).
+	struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base
+	{
+		cs_shuffle_se_f32_d24x8()
+		{
+			build("f32_to_d24x8_swapped");
+		}
+	};
+
+	// Shuffle task that swaps the endianness of d24x8 words: the kernel applies
+	// d24x8_to_d24x8_swapped, which exchanges bytes 0 and 2 and keeps byte 1.
+	struct cs_shuffle_se_d24x8 : cs_shuffle_base
+	{
+		cs_shuffle_se_d24x8()
+		{
+			build("d24x8_to_d24x8_swapped");
+		}
+	};
+
+	// NOTE: D24S8 layout has the stencil in the MSB! It's actually S8|D24|S8|D24 starting at offset 0
+	//
+	// Common base of the depth/stencil gather and scatter tasks. It declares the uniforms and
+	// scratch variables their kernels use; the derived constructors supply work_kernel and
+	// call build().
+	struct cs_interleave_task : cs_shuffle_base
+	{
+		cs_interleave_task()
+		{
+			// Scratch variables emitted at the top of main()
+			variables =
+				"	uint depth;\n"
+				"	uint stencil;\n"
+				"	uint stencil_shift;\n"
+				"	uint stencil_offset;\n";
+
+			// Uniform declarations emitted ahead of main()
+			uniforms =
+            "   uniform uint block_length;\n"
+            "   uniform uint z_offset;\n"
+            "   uniform uint s_offset;\n";
+		}
+
+		// Uploads the block description and dispatches the kernel over `data`.
+		// The depth and stencil offsets are handed to the shader relative to data_offset.
+		//
+		// NOTE(review): cs_shuffle_base::run treats data_length as a byte count, whereas the
+		// kernels compare block_length against, and add the offsets to, a word index.
+		// Confirm the units callers are expected to pass.
+		void run(const gl::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
+		{
+			const u32 relative_z = zeta_offset - data_offset;
+			const u32 relative_s = stencil_offset - data_offset;
+
+			m_program.uniforms["block_length"] = data_length;
+			m_program.uniforms["z_offset"] = relative_z;
+			m_program.uniforms["s_offset"] = relative_s;
+
+			cs_shuffle_base::run(data, data_length, data_offset);
+		}
+	};
+
+	// Gathers separately stored depth and stencil data into packed words.
+	// For each index the kernel masks the low 24 bits of data[index + z_offset], extracts byte
+	// (index % 4) of data[index / 4 + s_offset] and writes (depth << 8) | stencil to data[index].
+	// With _SwapBytes set, the packed word goes through bswap_u32 before it is stored.
+	template<bool _SwapBytes = false>
+	struct cs_gather_d24x8 : cs_interleave_task
+	{
+		cs_gather_d24x8()
+		{
+			// Final store statement, selected at compile time
+			const char* const store_statement = _SwapBytes ?
+				"		data[index] = bswap_u32(value);\n" :
+				"		data[index] = value;\n";
+
+			work_kernel =
+				"		if (index >= block_length)\n"
+				"			return;\n"
+				"\n"
+				"		depth = data[index + z_offset] & 0x00FFFFFF;\n"
+				"		stencil_offset = (index / 4);\n"
+				"		stencil_shift = (index % 4) * 8;\n"
+				"		stencil = data[stencil_offset + s_offset];\n"
+				"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
+				"		value = (depth << 8) | stencil;\n";
+
+			work_kernel += store_statement;
+
+			build("");
+		}
+	};
+
+	// Gathers separately stored float depth and stencil data into packed words.
+	// For each index the kernel converts data[index + z_offset] with f32_to_d24, extracts byte
+	// (index % 4) of data[index / 4 + s_offset] and writes (depth << 8) | stencil to data[index].
+	// With _SwapBytes set, the packed word goes through bswap_u32 before it is stored.
+	template<bool _SwapBytes = false>
+	struct cs_gather_d32x8 : cs_interleave_task
+	{
+		cs_gather_d32x8()
+		{
+			// Final store statement, selected at compile time
+			const char* const store_statement = _SwapBytes ?
+				"		data[index] = bswap_u32(value);\n" :
+				"		data[index] = value;\n";
+
+			work_kernel =
+				"		if (index >= block_length)\n"
+				"			return;\n"
+				"\n"
+				"		depth = f32_to_d24(data[index + z_offset]);\n"
+				"		stencil_offset = (index / 4);\n"
+				"		stencil_shift = (index % 4) * 8;\n"
+				"		stencil = data[stencil_offset + s_offset];\n"
+				"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
+				"		value = (depth << 8) | stencil;\n";
+
+			work_kernel += store_statement;
+
+			build("");
+		}
+	};
+
+	// Scatters packed depth/stencil words into separate depth and stencil regions.
+	// For each index the kernel writes (value >> 8) to data[index + z_offset] and merges the
+	// low byte of value into byte (index % 4) of data[index / 4 + s_offset].
+	struct cs_scatter_d24x8 : cs_interleave_task
+	{
+		cs_scatter_d24x8()
+		{
+			// Four consecutive indices share one stencil word and may be handled by different
+			// invocations, so the merge has to be atomic; a plain |= is a read-modify-write
+			// that can drop the bytes written by neighbouring invocations.
+			// NOTE(review): OR-ing presumably expects the destination stencil words to be
+			// zeroed beforehand - confirm with the caller.
+			work_kernel =
+				"		if (index >= block_length)\n"
+				"			return;\n"
+				"\n"
+				"		value = data[index];\n"
+				"		data[index + z_offset] = (value >> 8);\n"
+				"		stencil_offset = (index / 4);\n"
+				"		stencil_shift = (index % 4) * 8;\n"
+				"		stencil = (value & 0xFF) << stencil_shift;\n"
+				"		atomicOr(data[stencil_offset + s_offset], stencil);\n";
+
+			cs_shuffle_base::build("");
+		}
+	};
+
+	// Scatters packed depth/stencil words into a float depth region and a stencil region.
+	// For each index the kernel writes d24_to_f32(value >> 8) to data[index + z_offset] and
+	// merges the low byte of value into byte (index % 4) of data[index / 4 + s_offset].
+	struct cs_scatter_d32x8 : cs_interleave_task
+	{
+		cs_scatter_d32x8()
+		{
+			// Four consecutive indices share one stencil word and may be handled by different
+			// invocations, so the merge has to be atomic; a plain |= is a read-modify-write
+			// that can drop the bytes written by neighbouring invocations.
+			// NOTE(review): OR-ing presumably expects the destination stencil words to be
+			// zeroed beforehand - confirm with the caller.
+			work_kernel =
+				"		if (index >= block_length)\n"
+				"			return;\n"
+				"\n"
+				"		value = data[index];\n"
+				"		data[index + z_offset] = d24_to_f32(value >> 8);\n"
+				"		stencil_offset = (index / 4);\n"
+				"		stencil_shift = (index % 4) * 8;\n"
+				"		stencil = (value & 0xFF) << stencil_shift;\n"
+				"		atomicOr(data[stencil_offset + s_offset], stencil);\n";
+
+			cs_shuffle_base::build("");
+		}
+	};
+
+	// TODO: Replace with a proper manager
+	extern std::unordered_map<u32, std::unique_ptr<gl::compute_task>> g_compute_tasks;
+
+	// Returns the shared instance of task type T from g_compute_tasks, keyed by the type's
+	// id_manager index. The instance is constructed and create()d on first request.
+	template<class T>
+	T* get_compute_task()
+	{
+		const u32 type_index = id_manager::typeinfo::get_index<T>();
+		auto& slot = g_compute_tasks[type_index];
+
+		if (slot)
+		{
+			// Already built by an earlier request
+			return static_cast<T*>(slot.get());
+		}
+
+		slot = std::make_unique<T>();
+		slot->create();
+
+		return static_cast<T*>(slot.get());
+	}
+}
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@ -1334,7 +1334,7 @@ void GLGSRender::load_program_env()
 		*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.clip_min();
 		*(reinterpret_cast<f32*>(buf + 140)) = rsx::method_registers.clip_max();

-		m_vertex_env_buffer->bind_range(0, mapping.second, 144);
+		m_vertex_env_buffer->bind_range(GL_VERTEX_PARAMS_BIND_SLOT, mapping.second, 144);
 	}

 	if (update_transform_constants)
@ -1344,7 +1344,7 @@ void GLGSRender::load_program_env()
 		auto buf = static_cast<u8*>(mapping.first);
 		fill_vertex_program_constants_data(buf);

-		m_transform_constants_buffer->bind_range(2, mapping.second, 8192);
+		m_transform_constants_buffer->bind_range(GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT, mapping.second, 8192);
 	}

 	if (update_fragment_constants)
@ -1356,7 +1356,7 @@ void GLGSRender::load_program_env()
 		m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), gsl::narrow<int>(fragment_constants_size) },
 			current_fragment_program, gl::get_driver_caps().vendor_NVIDIA);

-		m_fragment_constants_buffer->bind_range(3, mapping.second, fragment_constants_size);
+		m_fragment_constants_buffer->bind_range(GL_FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, mapping.second, fragment_constants_size);
 	}

 	if (update_fragment_env)
@ -1366,7 +1366,7 @@ void GLGSRender::load_program_env()
 		auto buf = static_cast<u8*>(mapping.first);
 		fill_fragment_state_buffer(buf, current_fragment_program);

-		m_fragment_env_buffer->bind_range(4, mapping.second, 32);
+		m_fragment_env_buffer->bind_range(GL_FRAGMENT_STATE_BIND_SLOT, mapping.second, 32);
 	}

 	if (update_fragment_texture_env)
@ -1376,7 +1376,7 @@ void GLGSRender::load_program_env()
 		auto buf = static_cast<u8*>(mapping.first);
 		fill_fragment_texture_parameters(buf, current_fragment_program);

-		m_texture_parameters_buffer->bind_range(5, mapping.second, 256);
+		m_texture_parameters_buffer->bind_range(GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT, mapping.second, 256);
 	}

 	if (manually_flush_ring_buffers)
@ -1409,7 +1409,7 @@ void GLGSRender::update_vertex_env(const gl::vertex_upload_info& upload_info)

 	fill_vertex_layout_state(m_vertex_layout, upload_info.first_vertex, upload_info.allocated_vertex_count, (s32*)buf, upload_info.persistent_mapping_offset, upload_info.volatile_mapping_offset);

-	m_vertex_layout_buffer->bind_range(1, mapping.second, 128 + 16);
+	m_vertex_layout_buffer->bind_range(GL_VERTEX_LAYOUT_BIND_SLOT, mapping.second, 128 + 16);

 	if (manually_flush_ring_buffers)
 	{
--- a/rpcs3/Emu/RSX/GL/GLHelpers.cpp
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.cpp
@ -1,10 +1,12 @@
 #include "stdafx.h"
 #include "GLHelpers.h"
 #include "GLTexture.h"
+#include "GLCompute.h"
 #include "Utilities/Log.h"

 namespace gl
 {
+	std::unordered_map<u32, std::unique_ptr<gl::compute_task>> g_compute_tasks;
 	blitter *g_hw_blitter = nullptr;
 	capabilities g_driver_caps;
 	const fbo screen{};
--- a/rpcs3/Emu/RSX/GL/GLHelpers.h
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.h
@ -21,6 +21,14 @@
 #define GL_STENCIL_MIRRORS_START   (GL_VERTEX_TEXTURES_START + 4)
 #define GL_STREAM_BUFFER_START     (GL_STENCIL_MIRRORS_START + 16)

+#define GL_VERTEX_PARAMS_BIND_SLOT 0
+#define GL_VERTEX_LAYOUT_BIND_SLOT 1
+#define GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT 2
+#define GL_FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT 3
+#define GL_FRAGMENT_STATE_BIND_SLOT 4
+#define GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT 5
+// Binding index of the Nth compute buffer; compute slots follow the six fixed bind slots above.
+// The argument is parenthesised so that expression arguments expand with the intended precedence.
+#define GL_COMPUTE_BUFFER_SLOT(index) ((index) + 6)
+
 inline static void _SelectTexture(int unit) { glActiveTexture(GL_TEXTURE0 + unit); }

 namespace gl
@ -904,6 +912,11 @@ namespace gl
 			verify(HERE), m_memory_type == memory_type::host_visible;
 			glUnmapBuffer((GLenum)current_target());
 		}
+
+		// Binds `size` units of this buffer, starting at `offset`, to indexed binding point
+		// `index` of the buffer's current target (thin wrapper over glBindBufferRange).
+		void bind_range(u32 index, u32 offset, u32 size) const
+		{
+			const auto target = static_cast<GLenum>(current_target());
+			glBindBufferRange(target, index, id(), offset, size);
+		}
 	};

 	class ring_buffer : public buffer
@ -991,11 +1004,6 @@ namespace gl

 		virtual void unmap() {}

-		void bind_range(u32 index, u32 offset, u32 size) const
-		{
-			glBindBufferRange((GLenum)current_target(), index, id(), offset, size);
-		}
-
 		//Notification of a draw command
 		virtual void notify()
 		{
@ -2383,7 +2391,7 @@ public:
 			{
 				fragment = GL_FRAGMENT_SHADER,
 				vertex = GL_VERTEX_SHADER,
-				geometry = GL_GEOMETRY_SHADER
+				compute = GL_COMPUTE_SHADER
 			};

 		private:
@ -2533,6 +2541,7 @@ public:
 				}

 				void operator = (int rhs) const { glProgramUniform1i(m_program.id(), location(), rhs); }
+				// Sets this uniform to an unsigned scalar via glProgramUniform1ui, addressing the
+				// owning program by id so it does not have to be the program currently in use.
+				void operator = (unsigned rhs) const { glProgramUniform1ui(m_program.id(), location(), rhs); }
 				void operator = (float rhs) const { glProgramUniform1f(m_program.id(), location(), rhs); }
 				void operator = (const color1i& rhs) const { glProgramUniform1i(m_program.id(), location(), rhs.r); }
 				void operator = (const color1f& rhs) const { glProgramUniform1f(m_program.id(), location(), rhs.r); }