diff --git a/rpcs3/Emu/RSX/GL/GLCompute.cpp b/rpcs3/Emu/RSX/GL/GLCompute.cpp index dd6ba7d276..091298a115 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.cpp +++ b/rpcs3/Emu/RSX/GL/GLCompute.cpp @@ -102,46 +102,19 @@ namespace gl kernel_size = _kernel_size? _kernel_size : optimal_kernel_size; m_src = - "#version 430\n" - "layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n" - "layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n" - "%ub" - "\n" - "#define KERNEL_SIZE %ks\n" - "\n" - "// Generic swap routines\n" - "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" - "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" - "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" - "\n" - "// Depth format conversions\n" - "#define d24f_to_f32(bits) (bits << 7)\n" - "#define f32_to_d24f(bits) (bits >> 7)\n" - "\n" - "uint linear_invocation_id()\n" - "{\n" - " uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n" - " return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n" - "}\n" - "\n" - "%md" - "void main()\n" - "{\n" - " uint invocation_id = linear_invocation_id();\n" - " uint index = invocation_id * KERNEL_SIZE;\n" - " uint value;\n" - " %vars" - "\n"; + #include "../Program/GLSLSnippets/ShuffleBytes.glsl" + ; const std::pair syntax_replace[] = { + { "%set, ", ""}, { "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) }, { "%ws", std::to_string(optimal_group_size) }, { "%ks", std::to_string(kernel_size) }, { "%vars", variables }, { "%f", function_name }, { "%ub", uniforms }, - { "%md", method_declarations } + { "%md", method_declarations }, }; m_src = fmt::replace_all(m_src, syntax_replace); diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 6bef8e33ed..06708b09e3 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h 
@@ -2,6 +2,7 @@ #include "Emu/IdManager.h" #include "GLHelpers.h" +#include "../rsx_utils.h" #include @@ -226,6 +227,116 @@ namespace gl } }; + // Reverse morton-order block arrangement + template + struct cs_deswizzle_3d : compute_task + { + union params_t + { + u32 data[7]; + + struct + { + u32 width; + u32 height; + u32 depth; + u32 logw; + u32 logh; + u32 logd; + u32 mipmaps; + }; + } + params; + + gl::buffer param_buffer; + + const gl::buffer* src_buffer = nullptr; + const gl::buffer* dst_buffer = nullptr; + u32 in_offset = 0; + u32 out_offset = 0; + u32 block_length = 0; + + cs_deswizzle_3d() + { + ensure((sizeof(_BlockType) & 3) == 0); // "Unsupported block type" + + m_src = + #include "../Program/GLSLSnippets/GPUDeswizzle.glsl" + ; + + std::string transform; + if constexpr (_SwapBytes) + { + if constexpr (sizeof(_BaseType) == 4) + { + transform = "bswap_u32"; + } + else if constexpr (sizeof(_BaseType) == 2) + { + transform = "bswap_u16"; + } + else + { + fmt::throw_exception("Unreachable"); + } + } + + const std::pair syntax_replace[] = + { + { "%set, ", ""}, + { "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0))}, + { "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(2)) }, + { "%ws", std::to_string(optimal_group_size) }, + { "%_wordcount", std::to_string(sizeof(_BlockType) / 4) }, + { "%f", transform } + }; + + m_src = fmt::replace_all(m_src, syntax_replace); + + param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY); + } + + ~cs_deswizzle_3d() + { + param_buffer.remove(); + } + + void bind_resources() override + { + src_buffer->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), in_offset, block_length); + dst_buffer->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(1), out_offset, block_length); // Destination range starts at out_offset (set by run()), not the source offset + param_buffer.bind_range(gl::buffer::target::uniform, GL_COMPUTE_BUFFER_SLOT(2), 0, sizeof(params)); + } + + void set_parameters(gl::command_context& /*cmd*/) + { 
+ param_buffer.sub_data(0, sizeof(params), params.data); + } + + void run(gl::command_context& cmd, const gl::buffer* dst, u32 out_offset, const gl::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) + { + dst_buffer = dst; + src_buffer = src; + + this->in_offset = in_offset; + this->out_offset = out_offset; + this->block_length = data_length; + + params.width = width; + params.height = height; + params.depth = depth; + params.mipmaps = mipmaps; + params.logw = rsx::ceil_log2(width); + params.logh = rsx::ceil_log2(height); + params.logd = rsx::ceil_log2(depth); + set_parameters(cmd); + + const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); + const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); + compute_task::run(cmd, linear_invocations); + } + }; + // TODO: Replace with a proper manager extern std::unordered_map> g_compute_tasks; diff --git a/rpcs3/Emu/RSX/GL/GLDraw.cpp b/rpcs3/Emu/RSX/GL/GLDraw.cpp index 8122c7e36b..abef1af15a 100644 --- a/rpcs3/Emu/RSX/GL/GLDraw.cpp +++ b/rpcs3/Emu/RSX/GL/GLDraw.cpp @@ -495,6 +495,8 @@ void GLGSRender::emit_geometry(u32 sub_index) m_frame_stats.vertex_upload_time += m_profiler.duration(); + gl_state.use_program(m_program->id()); + if (!upload_info.index_info) { if (draw_call.is_single_draw()) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 3f2903bdc1..c9904a9e16 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -762,8 +762,6 @@ void GLGSRender::load_program_env() const bool update_instruction_buffers = (!!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program)); const bool update_raster_env = (rsx::method_registers.polygon_stipple_enabled() && !!(m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty)); - gl_state.use_program(m_program->id()); - if (manually_flush_ring_buffers) { if (update_fragment_env) 
m_fragment_env_buffer->reserve_storage_on_heap(128); @@ -801,13 +799,13 @@ void GLGSRender::load_program_env() const usz transform_constants_size = (!m_vertex_prog || m_vertex_prog->has_indexed_constants) ? 8192 : m_vertex_prog->constant_ids.size() * 16; if (transform_constants_size) { - auto mapping = m_transform_constants_buffer->alloc_from_heap(transform_constants_size, m_uniform_buffer_offset_align); + auto mapping = m_transform_constants_buffer->alloc_from_heap(static_cast(transform_constants_size), m_uniform_buffer_offset_align); auto buf = static_cast(mapping.first); const std::vector& constant_ids = (transform_constants_size == 8192) ? std::vector{} : m_vertex_prog->constant_ids; fill_vertex_program_constants_data(buf, constant_ids); - m_transform_constants_buffer->bind_range(GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT, mapping.second, transform_constants_size); + m_transform_constants_buffer->bind_range(GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT, mapping.second, static_cast(transform_constants_size)); } } diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.cpp b/rpcs3/Emu/RSX/GL/GLHelpers.cpp index 92a1311b38..8ac1c75361 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.cpp +++ b/rpcs3/Emu/RSX/GL/GLHelpers.cpp @@ -252,7 +252,7 @@ namespace gl void fbo::draw_buffers(const std::initializer_list& indexes) const { rsx::simple_array ids; - ids.reserve(indexes.size()); + ids.reserve(::size32(indexes)); for (auto &index : indexes) ids.push_back(index.id()); diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index a62137aa0a..c91da6a9fb 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -613,6 +613,14 @@ namespace gl fmt::throw_exception("Unsupported buffer usage 0x%x", usage); } } + else + { + // Local memory hints + if (usage == GL_DYNAMIC_COPY) + { + flags |= GL_DYNAMIC_STORAGE_BIT; + } + } if ((flags & GL_MAP_READ_BIT) && !caps.vendor_AMD) { @@ -624,7 +632,6 @@ namespace gl flags |= GL_CLIENT_STORAGE_BIT; } - save_binding_state 
save(current_target(), *this); DSA_CALL2(NamedBufferStorage, m_id, size, data_, flags); m_size = size; } @@ -674,6 +681,7 @@ namespace gl void create() { glGenBuffers(1, &m_id); + save_binding_state save(current_target(), *this); } void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW) @@ -684,8 +692,9 @@ namespace gl void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW) { - create(); m_target = target_; + + create(); allocate(size, data_, type, usage); } @@ -748,7 +757,7 @@ namespace gl void sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data) { - ensure(m_memory_type != memory_type::local); + ensure(m_memory_type == memory_type::local); DSA_CALL2(NamedBufferSubData, m_id, offset, length, data); } diff --git a/rpcs3/Emu/RSX/GL/GLTextOut.h b/rpcs3/Emu/RSX/GL/GLTextOut.h index 686008875e..814d8396c6 100644 --- a/rpcs3/Emu/RSX/GL/GLTextOut.h +++ b/rpcs3/Emu/RSX/GL/GLTextOut.h @@ -19,7 +19,6 @@ namespace gl gl::vao m_vao; gl::buffer m_text_buffer; - gl::buffer m_scale_offsets_buffer; std::unordered_map> m_offsets; bool initialized = false; @@ -87,19 +86,14 @@ namespace gl void init() { - m_text_buffer.create(); - m_scale_offsets_buffer.create(); - GlyphManager glyph_source; auto points = glyph_source.generate_point_map(); const usz buffer_size = points.size() * sizeof(GlyphManager::glyph_point); - m_text_buffer.data(buffer_size, points.data()); + m_text_buffer.create(gl::buffer::target::array, buffer_size, points.data(), gl::buffer::memory_type::host_visible); m_offsets = glyph_source.get_glyph_offsets(); - m_scale_offsets_buffer.data(512 * 4 * sizeof(float)); - //Init VAO int old_vao; glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao); @@ -198,7 +192,6 @@ namespace gl { if (initialized) { - m_scale_offsets_buffer.remove(); m_text_buffer.remove(); m_vao.remove(); diff --git 
a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index 2e2dc13eea..a453e815fb 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -21,13 +21,13 @@ namespace gl std::pair prepare_compute_resources(usz staging_data_length) { - if (g_upload_transfer_buffer.size() < staging_data_length) + if (g_upload_transfer_buffer.size() < static_cast(staging_data_length)) { g_upload_transfer_buffer.remove(); g_upload_transfer_buffer.create(staging_data_length, nullptr, buffer::memory_type::host_visible, GL_STREAM_DRAW); } - if (g_compute_decode_buffer.size() < staging_data_length * 3) + if (g_compute_decode_buffer.size() < static_cast(staging_data_length) * 3) { g_compute_decode_buffer.remove(); g_compute_decode_buffer.create(std::max(512, staging_data_length * 3), nullptr, buffer::memory_type::local, GL_STATIC_COPY); @@ -43,6 +43,31 @@ namespace gl g_compute_decode_buffer.remove(); } + template + void do_deswizzle_transformation(gl::command_context& cmd, u32 block_size, buffer* dst, buffer* src, u32 data_length, u16 width, u16 height, u16 depth) + { + switch (block_size) + { + case 4: + gl::get_compute_task>()->run( + cmd, dst, 0, src, 0, + data_length, width, height, depth, 1); + break; + case 8: + gl::get_compute_task>()->run( + cmd, dst, 0, src, 0, + data_length, width, height, depth, 1); + break; + case 16: + gl::get_compute_task>()->run( + cmd, dst, 0, src, 0, + data_length, width, height, depth, 1); + break; + default: + fmt::throw_exception("Unreachable"); + } + } + GLenum get_target(rsx::texture_dimension_extended type) { switch (type) @@ -623,11 +648,12 @@ namespace gl const std::vector &input_layouts, bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector& staging_buffer) { + const auto driver_caps = gl::get_driver_caps(); rsx::texture_uploader_capabilities caps { .supports_byteswap = true, .supports_vtc_decoding = false, - .supports_hw_deswizzle = false, + .supports_hw_deswizzle = 
driver_caps.ARB_compute_shader_supported, .supports_zero_copy = false, .alignment = 4 }; @@ -635,9 +661,12 @@ namespace gl pixel_unpack_settings unpack_settings; unpack_settings.row_length(0).alignment(4); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE); + glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); + if (rsx::is_compressed_host_format(format)) [[likely]] { - caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA; + caps.supports_vtc_decoding = driver_caps.vendor_NVIDIA; unpack_settings.apply(); glBindTexture(static_cast(dst->get_target()), dst->id()); @@ -688,7 +717,7 @@ namespace gl else { bool apply_settings = true; - bool use_compute_transform = false; + bool use_compute_transform = is_swizzled; buffer *upload_scratch_mem = nullptr, *compute_scratch_mem = nullptr; image_memory_requirements mem_info; pixel_buffer_layout mem_layout; @@ -698,6 +727,8 @@ namespace gl u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format); u64 image_linear_size; + gl::buffer deswizzle_buf; + switch (gl_type) { case GL_BYTE: @@ -710,8 +741,6 @@ namespace gl case GL_FLOAT: case GL_UNSIGNED_INT_24_8: case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: - mem_layout.format = gl_format; - mem_layout.type = gl_type; mem_layout.swap_bytes = true; mem_layout.size = 4; use_compute_transform = true; @@ -747,14 +776,61 @@ namespace gl if (use_compute_transform) { + // 0. Preconf + mem_layout.swap_bytes = op.require_swap; + mem_layout.format = gl_format; + mem_layout.type = gl_type; + // 1. Unmap buffer upload_scratch_mem->unmap(); // 2. 
Upload memory to GPU - upload_scratch_mem->copy_to(compute_scratch_mem, 0, 0, image_linear_size); + if (!op.require_deswizzle) + { + upload_scratch_mem->copy_to(compute_scratch_mem, 0, 0, image_linear_size); + } + else + { + // 2.1 Copy data to deswizzle buf + if (deswizzle_buf.size() < image_linear_size) + { + deswizzle_buf.remove(); + deswizzle_buf.create(gl::buffer::target::ssbo, image_linear_size, nullptr, gl::buffer::memory_type::local); + } + + upload_scratch_mem->copy_to(&deswizzle_buf, 0, 0, image_linear_size); + + // 2.2 Apply compute transform to deswizzle input and dump it in compute_scratch_mem + ensure(op.element_size == 2 || op.element_size == 4); + const auto block_size = op.element_size * op.block_length; + + if (op.require_swap) + { + mem_layout.swap_bytes = false; + + if (op.element_size == 4) [[ likely ]] + { + do_deswizzle_transformation(cmd, block_size, compute_scratch_mem, &deswizzle_buf, image_linear_size, layout.width_in_texel, layout.height_in_texel, layout.depth); + } + else + { + do_deswizzle_transformation(cmd, block_size, compute_scratch_mem, &deswizzle_buf, image_linear_size, layout.width_in_texel, layout.height_in_texel, layout.depth); + } + } + else + { + if (op.element_size == 4) [[ likely ]] + { + do_deswizzle_transformation(cmd, block_size, compute_scratch_mem, &deswizzle_buf, image_linear_size, layout.width_in_texel, layout.height_in_texel, layout.depth); + } + else + { + do_deswizzle_transformation(cmd, block_size, compute_scratch_mem, &deswizzle_buf, image_linear_size, layout.width_in_texel, layout.height_in_texel, layout.depth); + } + } + } // 3. 
Update configuration - mem_layout.swap_bytes = op.require_swap; mem_info.image_size_in_texels = image_linear_size / block_size_in_bytes; mem_info.image_size_in_bytes = image_linear_size; mem_info.memory_required = 0; @@ -773,6 +849,8 @@ namespace gl dst->copy_from(out_pointer, static_cast(gl_format), static_cast(gl_type), layout.level, region, unpack_settings); } } + + deswizzle_buf.remove(); } } diff --git a/rpcs3/Emu/RSX/Common/Interpreter/FragmentInterpreter.glsl b/rpcs3/Emu/RSX/Program/GLSLInterpreter/FragmentInterpreter.glsl similarity index 100% rename from rpcs3/Emu/RSX/Common/Interpreter/FragmentInterpreter.glsl rename to rpcs3/Emu/RSX/Program/GLSLInterpreter/FragmentInterpreter.glsl diff --git a/rpcs3/Emu/RSX/Common/Interpreter/VertexInterpreter.glsl b/rpcs3/Emu/RSX/Program/GLSLInterpreter/VertexInterpreter.glsl similarity index 100% rename from rpcs3/Emu/RSX/Common/Interpreter/VertexInterpreter.glsl rename to rpcs3/Emu/RSX/Program/GLSLInterpreter/VertexInterpreter.glsl diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/GPUDeswizzle.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/GPUDeswizzle.glsl new file mode 100644 index 0000000000..b34d3b3cba --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/GPUDeswizzle.glsl @@ -0,0 +1,130 @@ +R"( +#version 450 + +#define SSBO_BASE_LOCATION %loc +#define SSBO(x) (SSBO_BASE_LOCATION + x) + +layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in; + +layout(%set, binding=SSBO(0), std430) buffer ssbo0{ uint data_in[]; }; +layout(%set, binding=SSBO(1), std430) buffer ssbo1{ uint data_out[]; }; +layout(%push_block) uniform parameters +{ + uint image_width; + uint image_height; + uint image_depth; + uint image_logw; + uint image_logh; + uint image_logd; + uint lod_count; +}; + +struct invocation_properties +{ + uint data_offset; + uvec3 size; + uvec3 size_log2; +}; + +#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8 +#define bswap_u32(bits) (bits & 
0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24 + +invocation_properties invocation; + +bool init_invocation_properties(const in uint offset) +{ + invocation.data_offset = 0; + invocation.size.x = image_width; + invocation.size.y = image_height; + invocation.size.z = image_depth; + invocation.size_log2.x = image_logw; + invocation.size_log2.y = image_logh; + invocation.size_log2.z = image_logd; + uint level_end = image_width * image_height * image_depth; + uint level = 1; + + while (offset >= level_end && level < lod_count) + { + invocation.data_offset = level_end; + invocation.size.xy /= 2; + invocation.size.xy = max(invocation.size.xy, uvec2(1)); + invocation.size_log2.xy = max(invocation.size_log2.xy, uvec2(1)); + invocation.size_log2.xy --; + level_end += (invocation.size.x * invocation.size.y * image_depth); + level++; + } + + return (offset < level_end); +} + +uint get_z_index(const in uint x_, const in uint y_, const in uint z_) +{ + uint offset = 0; + uint shift = 0; + uint x = x_; + uint y = y_; + uint z = z_; + uint log2w = invocation.size_log2.x; + uint log2h = invocation.size_log2.y; + uint log2d = invocation.size_log2.z; + + do + { + if (log2w > 0) + { + offset |= (x & 1) << shift; + shift++; + x >>= 1; + log2w--; + } + + if (log2h > 0) + { + offset |= (y & 1) << shift; + shift++; + y >>= 1; + log2h--; + } + + if (log2d > 0) + { + offset |= (z & 1) << shift; + shift++; + z >>= 1; + log2d--; + } + } + while(x > 0 || y > 0 || z > 0); + + return offset; +} + +void main() +{ + uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x); + uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x; + uint word_count = %_wordcount; + + if (!init_invocation_properties(texel_id)) + return; + + // Calculations done in texels, not bytes + uint row_length = invocation.size.x; + uint slice_length = (invocation.size.y * row_length); + uint level_offset = (texel_id - invocation.data_offset); + uint 
slice_offset = (level_offset % slice_length); + uint z = (level_offset / slice_length); + uint y = (slice_offset / row_length); + uint x = (slice_offset % row_length); + + uint src_texel_id = get_z_index(x, y, z); + uint dst_id = (texel_id * word_count); + uint src_id = (src_texel_id + invocation.data_offset) * word_count; + + for (uint i = 0; i < word_count; ++i) + { + uint value = data_in[src_id++]; + data_out[dst_id++] = %f(value); + } +} +)" \ No newline at end of file diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/ShuffleBytes.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/ShuffleBytes.glsl new file mode 100644 index 0000000000..34ffb368ae --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/ShuffleBytes.glsl @@ -0,0 +1,37 @@ +R"( +#version 430 +layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in; +layout(%set, binding=%loc, std430) buffer ssbo{ uint data[]; }; +%ub + +#define KERNEL_SIZE %ks + +// Generic swap routines +#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8 +#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24 +#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16 + +// Depth format conversions (superset of the macros both the GL and VK shuffle kernels previously defined) +#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f) +#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f) +#define d24f_to_f32(bits) (bits << 7) +#define f32_to_d24f(bits) (bits >> 7) +#define d24x8_to_f32(bits) d24_to_f32(bits >> 8) +#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16 +#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits)) + +uint linear_invocation_id() +{ + uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x); + return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x; +} + +%md +void main() +{ + uint invocation_id = linear_invocation_id(); + uint index = invocation_id * KERNEL_SIZE; + uint value; + %vars + +)" diff --git a/rpcs3/Emu/RSX/Program/ShaderInterpreter.h b/rpcs3/Emu/RSX/Program/ShaderInterpreter.h index 06a81501a7..a397aa2cb3 100644 --- a/rpcs3/Emu/RSX/Program/ShaderInterpreter.h +++ b/rpcs3/Emu/RSX/Program/ShaderInterpreter.h @@ -24,7 +24,7 @@ namespace program_common static std::string 
get_vertex_interpreter() { const char* s = - #include "../Common/Interpreter/VertexInterpreter.glsl" + #include "../Program/GLSLInterpreter/VertexInterpreter.glsl" ; return s; } @@ -32,7 +32,7 @@ namespace program_common static std::string get_fragment_interpreter() { const char* s = - #include "../Common/Interpreter/FragmentInterpreter.glsl" + #include "../Program/GLSLInterpreter/FragmentInterpreter.glsl" ; return s; } diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp index a4dcb838f6..d2079e84f6 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.cpp +++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp @@ -224,40 +224,14 @@ namespace vk kernel_size = _kernel_size? _kernel_size : optimal_kernel_size; m_src = - "#version 430\n" - "layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n" - "layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n" - "%ub" - "\n" - "#define KERNEL_SIZE %ks\n" - "\n" - "// Generic swap routines\n" - "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" - "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" - "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" - "\n" - "// Depth format conversions\n" - "#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n" - "#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n" - "#define d24f_to_f32(bits) (bits << 7)\n" - "#define f32_to_d24f(bits) (bits >> 7)\n" - "#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n" - "#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n" - "#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n" - "\n" - "%md" - "void main()\n" - "{\n" - " uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);" - " uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + 
gl_GlobalInvocationID.x;\n" - " uint index = invocation_id * KERNEL_SIZE;\n" - " uint value;\n" - "%vars" - "\n"; + #include "../Program/GLSLSnippets/ShuffleBytes.glsl" + ; const auto parameters_size = utils::align(push_constants_size, 16) / 16; const std::pair syntax_replace[] = { + { "%loc", "0" }, + { "%set", "set = 0"}, { "%ws", std::to_string(optimal_group_size) }, { "%ks", std::to_string(kernel_size) }, { "%vars", variables }, diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index 8c1ebbfffd..790238178c 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -422,131 +422,8 @@ namespace vk create(); m_src = - "#version 450\n" - "layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n" - - "layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n" - "layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n" - "layout(push_constant) uniform parameters\n" - "{\n" - " uint image_width;\n" - " uint image_height;\n" - " uint image_depth;\n" - " uint image_logw;\n" - " uint image_logh;\n" - " uint image_logd;\n" - " uint lod_count;\n" - "};\n\n" - - "struct invocation_properties\n" - "{\n" - " uint data_offset;\n" - " uvec3 size;\n" - " uvec3 size_log2;\n" - "};\n\n" - - "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" - "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n\n" - - "invocation_properties invocation;\n\n" - - "bool init_invocation_properties(const in uint offset)\n" - "{\n" - " invocation.data_offset = 0;\n" - " invocation.size.x = image_width;\n" - " invocation.size.y = image_height;\n" - " invocation.size.z = image_depth;\n" - " invocation.size_log2.x = image_logw;\n" - " invocation.size_log2.y = image_logh;\n" - " invocation.size_log2.z = image_logd;\n" - " uint level_end = image_width * image_height * image_depth;\n" - " 
uint level = 1;\n\n" - - " while (offset >= level_end && level < lod_count)\n" - " {\n" - " invocation.data_offset = level_end;\n" - " invocation.size.xy /= 2;\n" - " invocation.size.xy = max(invocation.size.xy, uvec2(1));\n" - " invocation.size_log2.xy = max(invocation.size_log2.xy, uvec2(1));\n" - " invocation.size_log2.xy --;\n" - " level_end += (invocation.size.x * invocation.size.y * image_depth);\n" - " level++;" - " }\n\n" - - " return (offset < level_end);\n" - "}\n\n" - - "uint get_z_index(const in uint x_, const in uint y_, const in uint z_)\n" - "{\n" - " uint offset = 0;\n" - " uint shift = 0;\n" - " uint x = x_;\n" - " uint y = y_;\n" - " uint z = z_;\n" - " uint log2w = invocation.size_log2.x;\n" - " uint log2h = invocation.size_log2.y;\n" - " uint log2d = invocation.size_log2.z;\n" - "\n" - " do\n" - " {\n" - " if (log2w > 0)\n" - " {\n" - " offset |= (x & 1) << shift;\n" - " shift++;\n" - " x >>= 1;\n" - " log2w--;\n" - " }\n" - "\n" - " if (log2h > 0)\n" - " {\n" - " offset |= (y & 1) << shift;\n" - " shift++;\n" - " y >>= 1;\n" - " log2h--;\n" - " }\n" - "\n" - " if (log2d > 0)\n" - " {\n" - " offset |= (z & 1) << shift;\n" - " shift++;\n" - " z >>= 1;\n" - " log2d--;\n" - " }\n" - " }\n" - " while(x > 0 || y > 0 || z > 0);\n" - "\n" - " return offset;\n" - "}\n\n" - - "void main()\n" - "{\n" - " uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);" - " uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n" - " uint word_count = %_wordcount;\n\n" - - " if (!init_invocation_properties(texel_id))\n" - " return;\n\n" - - " // Calculations done in texels, not bytes\n" - " uint row_length = invocation.size.x;\n" - " uint slice_length = (invocation.size.y * row_length);\n" - " uint level_offset = (texel_id - invocation.data_offset);\n" - " uint slice_offset = (level_offset % slice_length);\n" - " uint z = (level_offset / slice_length);\n" - " uint y = (slice_offset / row_length);\n" - " uint x = (slice_offset 
% row_length);\n\n" - - " uint src_texel_id = get_z_index(x, y, z);\n" - " uint dst_id = (texel_id * word_count);\n" - " uint src_id = (src_texel_id + invocation.data_offset) * word_count;\n\n" - - " for (uint i = 0; i < word_count; ++i)\n" - " {\n" - " uint value = data_in[src_id++];\n" - " data_out[dst_id++] = %f(value);\n" - " }\n\n" - - "}\n"; + #include "../Program/GLSLSnippets/GPUDeswizzle.glsl" + ; std::string transform; if constexpr (_SwapBytes) @@ -567,6 +444,9 @@ namespace vk const std::pair syntax_replace[] = { + { "%loc", "0" }, + { "%set", "set = 0" }, + { "%push_block", "push_constant" }, { "%ws", std::to_string(optimal_group_size) }, { "%_wordcount", std::to_string(sizeof(_BlockType) / 4) }, { "%f", transform } diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 35d01d6b92..ac726638a1 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -814,8 +814,10 @@ - - + + + + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index 080583ddcc..b324273da1 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -64,15 +64,18 @@ {652ce43e-72db-42cd-831a-0e194f67e731} - - {bc97b324-1eea-445a-8fa9-6fc49e3df47c} - {7555ff6f-67a9-4d02-b744-0bf896751edb} {d055ca32-157a-4d8c-895e-29509858fcb0} + + {21667779-4136-4de4-8695-9ea13e5c9bce} + + + {bc97b324-1eea-445a-8fa9-6fc49e3df47c} + @@ -2130,11 +2133,17 @@ - - Emu\GPU\RSX\Common\Interpreter + + Emu\GPU\RSX\Program\Snippets - - Emu\GPU\RSX\Common\Interpreter + + Emu\GPU\RSX\Program\Interpreter + + + Emu\GPU\RSX\Program\Interpreter + + + Emu\GPU\RSX\Program\Snippets \ No newline at end of file