From 9fc4face663ae28026ca08f7b101fd126602a94f Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Thu, 4 Mar 2021 14:12:16 +1000 Subject: [PATCH] GPU/HW: Use SSBO+compute shader for VRAM readbacks --- src/common/vulkan/builders.cpp | 42 ++++++++- src/common/vulkan/builders.h | 21 ++++- src/core/gpu_hw_d3d11.cpp | 77 ++++++++++++++++- src/core/gpu_hw_d3d11.h | 12 ++- src/core/gpu_hw_shadergen.cpp | 82 ++++++++++++++++++ src/core/gpu_hw_shadergen.h | 1 + src/core/gpu_hw_vulkan.cpp | 119 +++++++++++++------------- src/core/gpu_hw_vulkan.h | 10 +-- src/core/shadergen.cpp | 29 +++++-- src/core/shadergen.h | 1 + src/frontend-common/fullscreen_ui.cpp | 2 +- 11 files changed, 318 insertions(+), 78 deletions(-) diff --git a/src/common/vulkan/builders.cpp b/src/common/vulkan/builders.cpp index 13cf33597..46634ea1b 100644 --- a/src/common/vulkan/builders.cpp +++ b/src/common/vulkan/builders.cpp @@ -402,6 +402,46 @@ void GraphicsPipelineBuilder::SetRenderPass(VkRenderPass render_pass, u32 subpas m_ci.subpass = subpass; } +ComputePipelineBuilder::ComputePipelineBuilder() +{ + Clear(); +} + +void ComputePipelineBuilder::Clear() +{ + m_ci = {}; + m_ci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; +} + +VkPipeline ComputePipelineBuilder::Create(VkDevice device, VkPipelineCache pipeline_cache, bool clear /* = true */) +{ + VkPipeline pipeline; + VkResult res = vkCreateComputePipelines(device, pipeline_cache, 1, &m_ci, nullptr, &pipeline); + if (res != VK_SUCCESS) + { + LOG_VULKAN_ERROR(res, "vkCreateComputePipelines() failed: "); + return VK_NULL_HANDLE; + } + + if (clear) + Clear(); + + return pipeline; +} + +void ComputePipelineBuilder::SetShader(VkShaderModule module, const char* entry_point) +{ + m_ci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + m_ci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + m_ci.stage.module = module; + m_ci.stage.pName = entry_point; +} + +void ComputePipelineBuilder::SetPipelineLayout(VkPipelineLayout layout) +{ + 
m_ci.layout = layout; +} + SamplerBuilder::SamplerBuilder() { Clear(); @@ -542,7 +582,7 @@ void DescriptorSetUpdateBuilder::AddCombinedImageSamplerDescriptorWrite( } void DescriptorSetUpdateBuilder::AddBufferDescriptorWrite(VkDescriptorSet set, u32 binding, VkDescriptorType dtype, - VkBuffer buffer, u32 offset, u32 size) + VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size) { Assert(m_num_writes < MAX_WRITES && m_num_infos < MAX_INFOS); diff --git a/src/common/vulkan/builders.h b/src/common/vulkan/builders.h index 217e3e7a8..d205ace4f 100644 --- a/src/common/vulkan/builders.h +++ b/src/common/vulkan/builders.h @@ -138,6 +138,23 @@ private: VkPipelineMultisampleStateCreateInfo m_multisample_state; }; +class ComputePipelineBuilder +{ +public: + ComputePipelineBuilder(); + + void Clear(); + + VkPipeline Create(VkDevice device, VkPipelineCache pipeline_cache = VK_NULL_HANDLE, bool clear = true); + + void SetShader(VkShaderModule module, const char* entry_point); + + void SetPipelineLayout(VkPipelineLayout layout); + +private: + VkComputePipelineCreateInfo m_ci; +}; + class SamplerBuilder { public: @@ -177,8 +194,8 @@ public: void AddSamplerDescriptorWrite(VkDescriptorSet set, u32 binding, VkSampler sampler); void AddCombinedImageSamplerDescriptorWrite(VkDescriptorSet set, u32 binding, VkImageView view, VkSampler sampler, VkImageLayout layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - void AddBufferDescriptorWrite(VkDescriptorSet set, u32 binding, VkDescriptorType dtype, VkBuffer buffer, u32 offset, - u32 size); + void AddBufferDescriptorWrite(VkDescriptorSet set, u32 binding, VkDescriptorType dtype, VkBuffer buffer, + VkDeviceSize offset, VkDeviceSize size); void AddBufferViewDescriptorWrite(VkDescriptorSet set, u32 binding, VkDescriptorType dtype, VkBufferView view); private: diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp index 11b398623..ad4edc5df 100644 --- a/src/core/gpu_hw_d3d11.cpp +++ b/src/core/gpu_hw_d3d11.cpp @@ -1,4 +1,5 @@ 
#include "gpu_hw_d3d11.h" +#include "common/align.h" #include "common/assert.h" #include "common/d3d11/shader_compiler.h" #include "common/log.h" @@ -363,14 +364,37 @@ bool GPU_HW_D3D11::CreateTextureBuffer() const CD3D11_SHADER_RESOURCE_VIEW_DESC srv_desc(D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_R16_UINT, 0, VRAM_UPDATE_TEXTURE_BUFFER_SIZE / sizeof(u16)); - const HRESULT hr = m_device->CreateShaderResourceView(m_texture_stream_buffer.GetD3DBuffer(), &srv_desc, - m_texture_stream_buffer_srv_r16ui.ReleaseAndGetAddressOf()); + HRESULT hr = m_device->CreateShaderResourceView(m_texture_stream_buffer.GetD3DBuffer(), &srv_desc, + m_texture_stream_buffer_srv_r16ui.ReleaseAndGetAddressOf()); if (FAILED(hr)) { Log_ErrorPrintf("Creation of texture buffer SRV failed: 0x%08X", hr); return false; } + const u32 buffer_elements = (VRAM_WIDTH / 2) * VRAM_HEIGHT; + const CD3D11_BUFFER_DESC read_buffer_desc(buffer_elements * sizeof(u32), D3D11_BIND_UNORDERED_ACCESS, + D3D11_USAGE_DEFAULT, 0, 0, sizeof(u32)); + const CD3D11_BUFFER_DESC staging_buffer_desc(buffer_elements * sizeof(u32), 0, D3D11_USAGE_STAGING, + D3D11_CPU_ACCESS_READ, 0, 0); + const CD3D11_UNORDERED_ACCESS_VIEW_DESC uav_desc(D3D11_UAV_DIMENSION_BUFFER, DXGI_FORMAT_R32_UINT, 0, buffer_elements, + 0); + hr = m_device->CreateBuffer(&read_buffer_desc, nullptr, m_vram_read_buffer.ReleaseAndGetAddressOf()); + if (SUCCEEDED(hr)) + { + hr = m_device->CreateBuffer(&staging_buffer_desc, nullptr, m_vram_read_staging_buffer.ReleaseAndGetAddressOf()); + if (SUCCEEDED(hr)) + { + hr = m_device->CreateUnorderedAccessView(m_vram_read_buffer.Get(), &uav_desc, + m_vram_read_buffer_view.ReleaseAndGetAddressOf()); + } + } + if (FAILED(hr)) + { + Log_ErrorPrintf("Creation of buffer/UAV failed: 0x%08X", hr); + return false; + } + return true; } @@ -612,6 +636,10 @@ bool GPU_HW_D3D11::CompileShaders() if (!m_vram_read_pixel_shader) return false; + m_vram_read_compute_shader = shader_cache.GetComputeShader(m_device.Get(), 
shadergen.GenerateVRAMReadComputeShader()); + if (!m_vram_read_compute_shader) + return false; + UPDATE_PROGRESS(); m_vram_write_pixel_shader = @@ -946,6 +974,7 @@ void GPU_HW_D3D11::ReadVRAM(u32 x, u32 y, u32 width, u32 height) const u32 encoded_width = (copy_rect.GetWidth() + 1) / 2; const u32 encoded_height = copy_rect.GetHeight(); +#if 0 // Encode the 24-bit texture as 16-bit. const u32 uniforms[4] = {copy_rect.left, copy_rect.top, copy_rect.GetWidth(), copy_rect.GetHeight()}; m_context->RSSetState(m_cull_none_rasterizer_state_no_msaa.Get()); @@ -971,6 +1000,50 @@ void GPU_HW_D3D11::ReadVRAM(u32 x, u32 y, u32 width, u32 height) } RestoreGraphicsAPIState(); +#else + // Encode the 24-bit texture as 16-bit. + const u32 uniforms[5] = {copy_rect.left, copy_rect.top, copy_rect.GetWidth(), copy_rect.GetHeight(), encoded_width}; + const auto res = m_uniform_stream_buffer.Map(m_context.Get(), MAX_UNIFORM_BUFFER_SIZE, sizeof(uniforms)); + std::memcpy(res.pointer, uniforms, sizeof(uniforms)); + m_uniform_stream_buffer.Unmap(m_context.Get(), sizeof(uniforms)); + m_context->CSSetConstantBuffers(0, 1, m_uniform_stream_buffer.GetD3DBufferArray()); + + m_context->OMSetRenderTargets(0, nullptr, nullptr); + m_context->CSSetUnorderedAccessViews(0, 1, m_vram_read_buffer_view.GetAddressOf(), nullptr); + m_context->CSSetShaderResources(0, 1, m_vram_texture.GetD3DSRVArray()); + m_context->CSSetShader(m_vram_read_compute_shader.Get(), nullptr, 0); + + const u32 groups_x = (encoded_width + 7) / 8; + const u32 groups_y = (encoded_height + 7) / 8; + m_context->Dispatch(groups_x, groups_y, 1); + + ID3D11ShaderResourceView* null_view[1] = {nullptr}; + m_context->CSSetShaderResources(0, 1, null_view); + m_context->OMSetRenderTargets(1, m_vram_texture.GetD3DRTVArray(), m_vram_depth_view.Get()); + + const CD3D11_BOX copy_box(0, 0, 0, static_cast(encoded_width * encoded_height * sizeof(u32)), 1, 1); + m_context->CopySubresourceRegion(m_vram_read_staging_buffer.Get(), 0, 0, 0, 0, 
m_vram_read_buffer.Get(), 0, + &copy_box); + + D3D11_MAPPED_SUBRESOURCE msr; + HRESULT hr = m_context->Map(m_vram_read_staging_buffer.Get(), 0, D3D11_MAP_READ, 0, &msr); + if (FAILED(hr)) + { + Log_ErrorPrintf("Failed to map VRAM readback buffer"); + return; + } + + u16* dst_ptr = &m_vram_shadow[copy_rect.top * VRAM_WIDTH + copy_rect.left]; + const u8* src_ptr = static_cast<const u8*>(msr.pData); + for (u32 row = 0; row < encoded_height; row++) + { + std::memcpy(dst_ptr, src_ptr, sizeof(u32) * encoded_width); + src_ptr += sizeof(u32) * encoded_width; + dst_ptr += VRAM_WIDTH; + } + + m_context->Unmap(m_vram_read_staging_buffer.Get(), 0); +#endif } void GPU_HW_D3D11::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) diff --git a/src/core/gpu_hw_d3d11.h b/src/core/gpu_hw_d3d11.h index fab7a7e7f..88785ea31 100644 --- a/src/core/gpu_hw_d3d11.h +++ b/src/core/gpu_hw_d3d11.h @@ -84,7 +84,6 @@ private: D3D11::Texture m_vram_depth_texture; ComPtr<ID3D11DepthStencilView> m_vram_depth_view; D3D11::Texture m_vram_read_texture; - D3D11::Texture m_vram_encoding_texture; D3D11::Texture m_display_texture; D3D11::StreamBuffer m_vertex_stream_buffer; @@ -93,7 +92,9 @@ private: D3D11::StreamBuffer m_texture_stream_buffer; - D3D11::StagingTexture m_vram_readback_texture; + ComPtr<ID3D11Buffer> m_vram_read_buffer; + ComPtr<ID3D11Buffer> m_vram_read_staging_buffer; + ComPtr<ID3D11UnorderedAccessView> m_vram_read_buffer_view; ComPtr<ID3D11ShaderResourceView> m_texture_stream_buffer_srv_r16ui; @@ -123,7 +124,7 @@ private: ComPtr<ID3D11PixelShader> m_copy_pixel_shader; ComPtr<ID3D11PixelShader> m_vram_fill_pixel_shader; ComPtr<ID3D11PixelShader> m_vram_interlaced_fill_pixel_shader; - ComPtr<ID3D11PixelShader> m_vram_read_pixel_shader; + ComPtr<ID3D11ComputeShader> m_vram_read_compute_shader; ComPtr<ID3D11PixelShader> m_vram_write_pixel_shader; ComPtr<ID3D11PixelShader> m_vram_copy_pixel_shader; ComPtr<ID3D11PixelShader> m_vram_update_depth_pixel_shader; @@ -139,4 +140,9 @@ private: D3D11::Texture m_downsample_texture; D3D11::Texture m_downsample_weight_texture; std::vector, ComPtr>> m_downsample_mip_views; + + // fallback vram read + D3D11::Texture m_vram_encoding_texture; + D3D11::StagingTexture m_vram_readback_texture; + ComPtr<ID3D11PixelShader> 
m_vram_read_pixel_shader; }; diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index 0420d643f..42464b65f 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -1192,6 +1192,88 @@ uint SampleVRAM(uint2 coords) return ss.str(); } +std::string GPU_HW_ShaderGen::GenerateVRAMReadComputeShader() +{ + std::stringstream ss; + WriteHeader(ss); + WriteCommonFunctions(ss); + DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size", "uint u_buffer_stride"}, true); + + DeclareTexture(ss, "samp0", 0, UsingMSAA()); + + if (m_glsl) + { + ss << "layout(std430"; + if (IsVulkan()) + ss << ", set = 0, binding = 2"; + else if (m_use_glsl_binding_layout) + ss << ", binding = 1"; + + ss << ") restrict writeonly buffer SSBO {\n"; + ss << " uint s_output_buffer[];\n"; + ss << "};\n"; + } + else + { + ss << "RWBuffer<uint> s_output_buffer : register(u0);\n"; + } + + ss << R"( +float4 LoadVRAM(int2 coords) +{ +#if MULTISAMPLING + float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u); + for (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++) + value += LOAD_TEXTURE_MS(samp0, coords, sample_index); + value /= float(MULTISAMPLES); + return value; +#else + return LOAD_TEXTURE(samp0, coords, 0); +#endif +} + +uint SampleVRAM(uint2 coords) +{ + if (RESOLUTION_SCALE == 1u) + return RGBA8ToRGBA5551(LoadVRAM(int2(coords))); + + // Box filter for downsampling. 
+ float4 value = float4(0.0, 0.0, 0.0, 0.0); + uint2 base_coords = coords * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); + for (uint offset_x = 0u; offset_x < RESOLUTION_SCALE; offset_x++) + { + for (uint offset_y = 0u; offset_y < RESOLUTION_SCALE; offset_y++) + value += LoadVRAM(int2(base_coords + uint2(offset_x, offset_y))); + } + value /= float(RESOLUTION_SCALE * RESOLUTION_SCALE); + return RGBA8ToRGBA5551(value); +} +)"; + + DeclareComputeEntryPoint(ss, 8, 8, 1); + ss << R"( +{ + uint2 sample_coords = uint2(uint(c_global_id.x) * 2u, uint(c_global_id.y)); + + #if API_OPENGL || API_OPENGL_ES + // Lower-left origin flip for OpenGL. + // We want to write the image out upside-down so we can read it top-to-bottom. + sample_coords.y = u_size.y - sample_coords.y - 1u; + #endif + + sample_coords += u_base_coords; + + // We're encoding as 32-bit, so the output width is halved and we pack two 16-bit pixels in one 32-bit pixel. + uint left = SampleVRAM(sample_coords); + uint right = SampleVRAM(uint2(sample_coords.x + 1u, sample_coords.y)); + + uint buffer_offset = c_global_id.y * u_buffer_stride + c_global_id.x; + s_output_buffer[buffer_offset] = left | (right << 16); +})"; + + return ss.str(); +} + std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_ssbo) { std::stringstream ss; diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h index dff617e97..2cb3537cc 100644 --- a/src/core/gpu_hw_shadergen.h +++ b/src/core/gpu_hw_shadergen.h @@ -17,6 +17,7 @@ public: std::string GenerateDisplayFragmentShader(bool depth_24bit, GPU_HW::InterlacedRenderMode interlace_mode, bool smooth_chroma); std::string GenerateVRAMReadFragmentShader(); + std::string GenerateVRAMReadComputeShader(); std::string GenerateVRAMWriteFragmentShader(bool use_ssbo); std::string GenerateVRAMCopyFragmentShader(); std::string GenerateVRAMUpdateDepthFragmentShader(); diff --git a/src/core/gpu_hw_vulkan.cpp b/src/core/gpu_hw_vulkan.cpp index abefcc9f7..7aad4e53b 100644 --- 
a/src/core/gpu_hw_vulkan.cpp +++ b/src/core/gpu_hw_vulkan.cpp @@ -360,10 +360,12 @@ void GPU_HW_Vulkan::DestroyResources() m_texture_stream_buffer.Destroy(false); Vulkan::Util::SafeDestroyPipelineLayout(m_vram_write_pipeline_layout); + Vulkan::Util::SafeDestroyPipelineLayout(m_vram_read_pipeline_layout); Vulkan::Util::SafeDestroyPipelineLayout(m_single_sampler_pipeline_layout); Vulkan::Util::SafeDestroyPipelineLayout(m_no_samplers_pipeline_layout); Vulkan::Util::SafeDestroyPipelineLayout(m_batch_pipeline_layout); Vulkan::Util::SafeDestroyDescriptorSetLayout(m_vram_write_descriptor_set_layout); + Vulkan::Util::SafeDestroyDescriptorSetLayout(m_vram_read_descriptor_set_layout); Vulkan::Util::SafeDestroyDescriptorSetLayout(m_single_sampler_descriptor_set_layout); Vulkan::Util::SafeDestroyDescriptorSetLayout(m_batch_descriptor_set_layout); Vulkan::Util::SafeDestroySampler(m_point_sampler); @@ -431,6 +433,12 @@ bool GPU_HW_Vulkan::CreatePipelineLayouts() if (m_vram_write_descriptor_set_layout == VK_NULL_HANDLE) return false; + dslbuilder.AddBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_COMPUTE_BIT); + dslbuilder.AddBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT); + m_vram_read_descriptor_set_layout = dslbuilder.Create(device); + if (m_vram_read_descriptor_set_layout == VK_NULL_HANDLE) + return false; + Vulkan::PipelineLayoutBuilder plbuilder; plbuilder.AddDescriptorSet(m_batch_descriptor_set_layout); m_batch_pipeline_layout = plbuilder.Create(device); @@ -448,6 +456,12 @@ bool GPU_HW_Vulkan::CreatePipelineLayouts() if (m_no_samplers_pipeline_layout == VK_NULL_HANDLE) return false; + plbuilder.AddDescriptorSet(m_vram_read_descriptor_set_layout); + plbuilder.AddPushConstants(VK_SHADER_STAGE_COMPUTE_BIT, 0, MAX_PUSH_CONSTANTS_SIZE); + m_vram_read_pipeline_layout = plbuilder.Create(device); + if (m_vram_read_pipeline_layout == VK_NULL_HANDLE) + return false; + 
plbuilder.AddDescriptorSet(m_vram_write_descriptor_set_layout); plbuilder.AddPushConstants(VK_SHADER_STAGE_FRAGMENT_BIT, 0, MAX_PUSH_CONSTANTS_SIZE); m_vram_write_pipeline_layout = plbuilder.Create(device); @@ -512,6 +526,7 @@ bool GPU_HW_Vulkan::CreateFramebuffer() const VkFormat texture_format = VK_FORMAT_R8G8B8A8_UNORM; const VkFormat depth_format = VK_FORMAT_D16_UNORM; const VkSampleCountFlagBits samples = static_cast<VkSampleCountFlagBits>(m_multisamples); + const u32 read_staging_buffer_size = (VRAM_WIDTH / 2) * VRAM_HEIGHT * sizeof(u32); if (!m_vram_texture.Create(texture_width, texture_height, 1, 1, texture_format, samples, VK_IMAGE_VIEW_TYPE_2D, VK_IMAGE_TILING_OPTIMAL, @@ -529,11 +544,9 @@ bool GPU_HW_Vulkan::CreateFramebuffer() VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT) || - !m_vram_readback_texture.Create(VRAM_WIDTH, VRAM_HEIGHT, 1, 1, texture_format, VK_SAMPLE_COUNT_1_BIT, - VK_IMAGE_VIEW_TYPE_2D, VK_IMAGE_TILING_OPTIMAL, - VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) || - !m_vram_readback_staging_texture.Create(Vulkan::StagingBuffer::Type::Readback, texture_format, VRAM_WIDTH / 2, - VRAM_HEIGHT)) + !m_vram_read_staging_buffer.Create(Vulkan::StagingBuffer::Type::Readback, read_staging_buffer_size, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) || + !m_vram_read_staging_buffer.Map()) { return false; } @@ -544,12 +557,9 @@ bool GPU_HW_Vulkan::CreateFramebuffer() g_vulkan_context->GetRenderPass(VK_FORMAT_UNDEFINED, depth_format, samples, VK_ATTACHMENT_LOAD_OP_DONT_CARE); m_display_render_pass = g_vulkan_context->GetRenderPass(m_display_texture.GetFormat(), VK_FORMAT_UNDEFINED, m_display_texture.GetSamples(), VK_ATTACHMENT_LOAD_OP_LOAD); - m_vram_readback_render_pass = - g_vulkan_context->GetRenderPass(m_vram_readback_texture.GetFormat(), VK_FORMAT_UNDEFINED, - m_vram_readback_texture.GetSamples(), VK_ATTACHMENT_LOAD_OP_DONT_CARE); if 
(m_vram_render_pass == VK_NULL_HANDLE || m_vram_update_depth_render_pass == VK_NULL_HANDLE || - m_display_render_pass == VK_NULL_HANDLE || m_vram_readback_render_pass == VK_NULL_HANDLE) + m_display_render_pass == VK_NULL_HANDLE) { return false; } @@ -565,13 +575,9 @@ bool GPU_HW_Vulkan::CreateFramebuffer() return false; m_vram_update_depth_framebuffer = m_vram_depth_texture.CreateFramebuffer(m_vram_update_depth_render_pass); - m_vram_readback_framebuffer = m_vram_readback_texture.CreateFramebuffer(m_vram_readback_render_pass); m_display_framebuffer = m_display_texture.CreateFramebuffer(m_display_render_pass); - if (m_vram_update_depth_framebuffer == VK_NULL_HANDLE || m_vram_readback_framebuffer == VK_NULL_HANDLE || - m_display_framebuffer == VK_NULL_HANDLE) - { + if (m_vram_update_depth_framebuffer == VK_NULL_HANDLE || m_display_framebuffer == VK_NULL_HANDLE) return false; - } VkCommandBuffer cmdbuf = g_vulkan_context->GetCurrentCommandBuffer(); m_vram_texture.TransitionToLayout(cmdbuf, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); @@ -582,10 +588,13 @@ bool GPU_HW_Vulkan::CreateFramebuffer() m_batch_descriptor_set = g_vulkan_context->AllocateGlobalDescriptorSet(m_batch_descriptor_set_layout); m_vram_copy_descriptor_set = g_vulkan_context->AllocateGlobalDescriptorSet(m_single_sampler_descriptor_set_layout); - m_vram_read_descriptor_set = g_vulkan_context->AllocateGlobalDescriptorSet(m_single_sampler_descriptor_set_layout); + m_vram_read_descriptor_set = g_vulkan_context->AllocateGlobalDescriptorSet(m_vram_read_descriptor_set_layout); + m_vram_update_depth_descriptor_set = + g_vulkan_context->AllocateGlobalDescriptorSet(m_single_sampler_descriptor_set_layout); m_display_descriptor_set = g_vulkan_context->AllocateGlobalDescriptorSet(m_single_sampler_descriptor_set_layout); if (m_batch_descriptor_set == VK_NULL_HANDLE || m_vram_copy_descriptor_set == VK_NULL_HANDLE || - m_vram_read_descriptor_set == VK_NULL_HANDLE || m_display_descriptor_set == VK_NULL_HANDLE) + 
m_vram_read_descriptor_set == VK_NULL_HANDLE || m_vram_update_depth_descriptor_set == VK_NULL_HANDLE || + m_display_descriptor_set == VK_NULL_HANDLE) { return false; } @@ -598,6 +607,10 @@ bool GPU_HW_Vulkan::CreateFramebuffer() m_point_sampler, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); dsubuilder.AddCombinedImageSamplerDescriptorWrite(m_vram_read_descriptor_set, 1, m_vram_texture.GetView(), m_point_sampler, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + dsubuilder.AddBufferDescriptorWrite(m_vram_read_descriptor_set, 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + m_vram_read_staging_buffer.GetBuffer(), 0, m_vram_read_staging_buffer.GetSize()); + dsubuilder.AddCombinedImageSamplerDescriptorWrite(m_vram_update_depth_descriptor_set, 1, m_vram_texture.GetView(), + m_point_sampler, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); dsubuilder.AddCombinedImageSamplerDescriptorWrite(m_display_descriptor_set, 1, m_display_texture.GetView(), m_point_sampler, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); dsubuilder.Update(g_vulkan_context->GetDevice()); @@ -743,21 +756,20 @@ void GPU_HW_Vulkan::DestroyFramebuffer() m_downsample_weight_texture.Destroy(false); Vulkan::Util::SafeFreeGlobalDescriptorSet(m_batch_descriptor_set); + Vulkan::Util::SafeFreeGlobalDescriptorSet(m_vram_update_depth_descriptor_set); Vulkan::Util::SafeFreeGlobalDescriptorSet(m_vram_copy_descriptor_set); Vulkan::Util::SafeFreeGlobalDescriptorSet(m_vram_read_descriptor_set); Vulkan::Util::SafeFreeGlobalDescriptorSet(m_display_descriptor_set); Vulkan::Util::SafeDestroyFramebuffer(m_vram_framebuffer); Vulkan::Util::SafeDestroyFramebuffer(m_vram_update_depth_framebuffer); - Vulkan::Util::SafeDestroyFramebuffer(m_vram_readback_framebuffer); Vulkan::Util::SafeDestroyFramebuffer(m_display_framebuffer); m_vram_read_texture.Destroy(false); m_vram_depth_texture.Destroy(false); m_vram_texture.Destroy(false); - m_vram_readback_texture.Destroy(false); m_display_texture.Destroy(false); - m_vram_readback_staging_texture.Destroy(false); + 
m_vram_read_staging_buffer.Destroy(false); } bool GPU_HW_Vulkan::CreateVertexBuffer() @@ -883,6 +895,7 @@ bool GPU_HW_Vulkan::CompilePipelines() } Vulkan::GraphicsPipelineBuilder gpbuilder; + Vulkan::ComputePipelineBuilder csbuilder; // [depth_test][render_mode][texture_mode][transparency_mode][dithering][interlacing] for (u8 depth_test = 0; depth_test < 3; depth_test++) @@ -1104,22 +1117,16 @@ bool GPU_HW_Vulkan::CompilePipelines() // VRAM read { - VkShaderModule fs = g_vulkan_shader_cache->GetFragmentShader(shadergen.GenerateVRAMReadFragmentShader()); - if (fs == VK_NULL_HANDLE) + VkShaderModule cs = g_vulkan_shader_cache->GetComputeShader(shadergen.GenerateVRAMReadComputeShader()); + if (cs == VK_NULL_HANDLE) return false; - gpbuilder.SetRenderPass(m_vram_readback_render_pass, 0); - gpbuilder.SetPipelineLayout(m_single_sampler_pipeline_layout); - gpbuilder.SetVertexShader(fullscreen_quad_vertex_shader); - gpbuilder.SetFragmentShader(fs); - gpbuilder.SetNoCullRasterizationState(); - gpbuilder.SetNoDepthTestState(); - gpbuilder.SetNoBlendingState(); - gpbuilder.SetDynamicViewportAndScissorState(); + csbuilder.SetPipelineLayout(m_vram_read_pipeline_layout); + csbuilder.SetShader(cs, "main"); - m_vram_readback_pipeline = gpbuilder.Create(device, pipeline_cache, false); - vkDestroyShaderModule(device, fs, nullptr); - if (m_vram_readback_pipeline == VK_NULL_HANDLE) + m_vram_read_pipeline = csbuilder.Create(device, pipeline_cache, false); + vkDestroyShaderModule(device, cs, nullptr); + if (m_vram_read_pipeline == VK_NULL_HANDLE) return false; UPDATE_PROGRESS(); @@ -1257,7 +1264,7 @@ void GPU_HW_Vulkan::DestroyPipelines() for (VkPipeline& p : m_vram_copy_pipelines) Vulkan::Util::SafeDestroyPipeline(p); - Vulkan::Util::SafeDestroyPipeline(m_vram_readback_pipeline); + Vulkan::Util::SafeDestroyPipeline(m_vram_read_pipeline); Vulkan::Util::SafeDestroyPipeline(m_vram_update_depth_pipeline); Vulkan::Util::SafeDestroyPipeline(m_downsample_first_pass_pipeline); @@ -1427,41 
+1434,37 @@ void GPU_HW_Vulkan::ReadVRAM(u32 x, u32 y, u32 width, u32 height) const Common::Rectangle copy_rect = GetVRAMTransferBounds(x, y, width, height); const u32 encoded_width = (copy_rect.GetWidth() + 1) / 2; const u32 encoded_height = copy_rect.GetHeight(); + const u32 encoded_size = encoded_width * encoded_height * sizeof(u32); EndRenderPass(); VkCommandBuffer cmdbuf = g_vulkan_context->GetCurrentCommandBuffer(); m_vram_texture.TransitionToLayout(cmdbuf, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - m_vram_readback_texture.TransitionToLayout(cmdbuf, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); - // Work around Mali driver bug: set full framebuffer size for render area. The GPU crashes with a page fault if we use - // the actual size we're rendering to... - BeginRenderPass(m_vram_readback_render_pass, m_vram_readback_framebuffer, 0, 0, m_vram_readback_texture.GetWidth(), - m_vram_readback_texture.GetHeight()); - - // Encode the 24-bit texture as 16-bit. - const u32 uniforms[4] = {copy_rect.left, copy_rect.top, copy_rect.GetWidth(), copy_rect.GetHeight()}; - vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, m_vram_readback_pipeline); - vkCmdPushConstants(cmdbuf, m_single_sampler_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, 0, sizeof(uniforms), - uniforms); - vkCmdBindDescriptorSets(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, m_single_sampler_pipeline_layout, 0, 1, + const u32 uniforms[5] = {copy_rect.left, copy_rect.top, copy_rect.GetWidth(), copy_rect.GetHeight(), encoded_width}; + vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_vram_read_pipeline); + vkCmdPushConstants(cmdbuf, m_vram_read_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(uniforms), uniforms); + vkCmdBindDescriptorSets(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_vram_read_pipeline_layout, 0, 1, &m_vram_read_descriptor_set, 0, nullptr); - Vulkan::Util::SetViewportAndScissor(cmdbuf, 0, 0, encoded_width, encoded_height); - vkCmdDraw(cmdbuf, 3, 1, 0, 0); - EndRenderPass(); 
+ const u32 groups_x = (encoded_width + 7) / 8; + const u32 groups_y = (encoded_height + 7) / 8; + vkCmdDispatch(cmdbuf, groups_x, groups_y, 1); - m_vram_readback_texture.TransitionToLayout(cmdbuf, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); m_vram_texture.TransitionToLayout(cmdbuf, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); + m_vram_read_staging_buffer.FlushGPUCache(cmdbuf, VK_ACCESS_SHADER_WRITE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, + encoded_size); + g_vulkan_context->ExecuteCommandBuffer(true); + m_vram_read_staging_buffer.InvalidateCPUCache(0, encoded_size); - // Stage the readback. - m_vram_readback_staging_texture.CopyFromTexture(m_vram_readback_texture, 0, 0, 0, 0, 0, 0, encoded_width, - encoded_height); - - // And copy it into our shadow buffer (will execute command buffer and stall). - m_vram_readback_staging_texture.ReadTexels(0, 0, encoded_width, encoded_height, - &m_vram_shadow[copy_rect.top * VRAM_WIDTH + copy_rect.left], - VRAM_WIDTH * sizeof(u16)); + u16* dst_ptr = &m_vram_shadow[copy_rect.top * VRAM_WIDTH + copy_rect.left]; + const char* src_ptr = static_cast<const char*>(m_vram_read_staging_buffer.GetMapPointer()); + for (u32 row = 0; row < encoded_height; row++) + { + std::memcpy(dst_ptr, src_ptr, sizeof(u32) * encoded_width); + src_ptr += sizeof(u32) * encoded_width; + dst_ptr += VRAM_WIDTH; + } RestoreGraphicsAPIState(); } @@ -1667,7 +1670,7 @@ void GPU_HW_Vulkan::UpdateDepthBufferFromMaskBit() vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, m_vram_update_depth_pipeline); vkCmdBindDescriptorSets(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, m_single_sampler_pipeline_layout, 0, 1, - &m_vram_read_descriptor_set, 0, nullptr); + &m_vram_update_depth_descriptor_set, 0, nullptr); Vulkan::Util::SetViewportAndScissor(cmdbuf, 0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight()); vkCmdDraw(cmdbuf, 3, 1, 0, 0); diff --git a/src/core/gpu_hw_vulkan.h b/src/core/gpu_hw_vulkan.h index a09bad3e6..79bfd5bdd 100644 --- a/src/core/gpu_hw_vulkan.h +++ 
b/src/core/gpu_hw_vulkan.h @@ -81,28 +81,27 @@ private: VkRenderPass m_vram_render_pass = VK_NULL_HANDLE; VkRenderPass m_vram_update_depth_render_pass = VK_NULL_HANDLE; VkRenderPass m_display_render_pass = VK_NULL_HANDLE; - VkRenderPass m_vram_readback_render_pass = VK_NULL_HANDLE; VkDescriptorSetLayout m_batch_descriptor_set_layout = VK_NULL_HANDLE; VkDescriptorSetLayout m_single_sampler_descriptor_set_layout = VK_NULL_HANDLE; + VkDescriptorSetLayout m_vram_read_descriptor_set_layout = VK_NULL_HANDLE; VkDescriptorSetLayout m_vram_write_descriptor_set_layout = VK_NULL_HANDLE; VkPipelineLayout m_batch_pipeline_layout = VK_NULL_HANDLE; VkPipelineLayout m_no_samplers_pipeline_layout = VK_NULL_HANDLE; VkPipelineLayout m_single_sampler_pipeline_layout = VK_NULL_HANDLE; + VkPipelineLayout m_vram_read_pipeline_layout = VK_NULL_HANDLE; VkPipelineLayout m_vram_write_pipeline_layout = VK_NULL_HANDLE; Vulkan::Texture m_vram_texture; Vulkan::Texture m_vram_depth_texture; Vulkan::Texture m_vram_read_texture; - Vulkan::Texture m_vram_readback_texture; - Vulkan::StagingTexture m_vram_readback_staging_texture; + Vulkan::StagingBuffer m_vram_read_staging_buffer; Vulkan::Texture m_display_texture; bool m_use_ssbos_for_vram_writes = false; VkFramebuffer m_vram_framebuffer = VK_NULL_HANDLE; VkFramebuffer m_vram_update_depth_framebuffer = VK_NULL_HANDLE; - VkFramebuffer m_vram_readback_framebuffer = VK_NULL_HANDLE; VkFramebuffer m_display_framebuffer = VK_NULL_HANDLE; VkSampler m_point_sampler = VK_NULL_HANDLE; @@ -113,6 +112,7 @@ private: VkDescriptorSet m_vram_copy_descriptor_set = VK_NULL_HANDLE; VkDescriptorSet m_vram_read_descriptor_set = VK_NULL_HANDLE; VkDescriptorSet m_vram_write_descriptor_set = VK_NULL_HANDLE; + VkDescriptorSet m_vram_update_depth_descriptor_set = VK_NULL_HANDLE; VkDescriptorSet m_display_descriptor_set = VK_NULL_HANDLE; Vulkan::StreamBuffer m_vertex_stream_buffer; @@ -132,7 +132,7 @@ private: std::array m_vram_write_pipelines{}; std::array 
m_vram_copy_pipelines{}; - VkPipeline m_vram_readback_pipeline = VK_NULL_HANDLE; + VkPipeline m_vram_read_pipeline = VK_NULL_HANDLE; VkPipeline m_vram_update_depth_pipeline = VK_NULL_HANDLE; // [depth_24][interlace_mode] diff --git a/src/core/shadergen.cpp b/src/core/shadergen.cpp index 641126773..f236afe19 100644 --- a/src/core/shadergen.cpp +++ b/src/core/shadergen.cpp @@ -340,7 +340,7 @@ void ShaderGen::DeclareVertexEntryPoint( for (u32 i = 0; i < num_texcoord_outputs; i++) ss << " " << qualifier << "float2 v_tex" << i << ";\n"; - for (const auto &[qualifiers, name] : additional_outputs) + for (const auto& [qualifiers, name] : additional_outputs) { const char* qualifier_to_use = (std::strlen(qualifiers) > 0) ? qualifiers : qualifier; ss << " " << qualifier_to_use << " " << name << ";\n"; @@ -357,7 +357,7 @@ void ShaderGen::DeclareVertexEntryPoint( for (u32 i = 0; i < num_texcoord_outputs; i++) ss << qualifier << "out float2 v_tex" << i << ";\n"; - for (const auto &[qualifiers, name] : additional_outputs) + for (const auto& [qualifiers, name] : additional_outputs) { const char* qualifier_to_use = (std::strlen(qualifiers) > 0) ? qualifiers : qualifier; ss << qualifier_to_use << " out " << name << ";\n"; @@ -399,7 +399,7 @@ void ShaderGen::DeclareVertexEntryPoint( ss << " " << qualifier << "out float2 v_tex" << i << " : TEXCOORD" << i << ",\n"; u32 additional_counter = num_texcoord_outputs; - for (const auto &[qualifiers, name] : additional_outputs) + for (const auto& [qualifiers, name] : additional_outputs) { const char* qualifier_to_use = (std::strlen(qualifiers) > 0) ? 
qualifiers : qualifier; ss << " " << qualifier_to_use << " out " << name << " : TEXCOORD" << additional_counter << ",\n"; @@ -433,7 +433,7 @@ void ShaderGen::DeclareFragmentEntryPoint( for (u32 i = 0; i < num_texcoord_inputs; i++) ss << " " << qualifier << "float2 v_tex" << i << ";\n"; - for (const auto &[qualifiers, name] : additional_inputs) + for (const auto& [qualifiers, name] : additional_inputs) { const char* qualifier_to_use = (std::strlen(qualifiers) > 0) ? qualifiers : qualifier; ss << " " << qualifier_to_use << " " << name << ";\n"; @@ -450,7 +450,7 @@ void ShaderGen::DeclareFragmentEntryPoint( for (u32 i = 0; i < num_texcoord_inputs; i++) ss << qualifier << "in float2 v_tex" << i << ";\n"; - for (const auto &[qualifiers, name] : additional_inputs) + for (const auto& [qualifiers, name] : additional_inputs) { const char* qualifier_to_use = (std::strlen(qualifiers) > 0) ? qualifiers : qualifier; ss << qualifier_to_use << " in " << name << ";\n"; @@ -503,7 +503,7 @@ void ShaderGen::DeclareFragmentEntryPoint( ss << " " << qualifier << "in float2 v_tex" << i << " : TEXCOORD" << i << ",\n"; u32 additional_counter = num_texcoord_inputs; - for (const auto &[qualifiers, name] : additional_inputs) + for (const auto& [qualifiers, name] : additional_inputs) { const char* qualifier_to_use = (std::strlen(qualifiers) > 0) ? 
qualifiers : qualifier; ss << " " << qualifier_to_use << " in " << name << " : TEXCOORD" << additional_counter << ",\n"; @@ -536,6 +536,23 @@ void ShaderGen::DeclareFragmentEntryPoint( } } +void ShaderGen::DeclareComputeEntryPoint(std::stringstream& ss, u32 local_size_x, u32 local_size_y, u32 local_size_z) +{ + if (m_glsl) + { + ss << "#define c_local_id gl_LocalInvocationID\n"; + ss << "#define c_global_id gl_GlobalInvocationID\n"; + ss << "layout(local_size_x = " << local_size_x << ", local_size_y = " << local_size_y + << ", local_size_z = " << local_size_z << ") in;\n"; + ss << "void main()\n"; + } + else + { + ss << "[numthreads(" << local_size_x << ", " << local_size_y << ", " << local_size_z << ")]\n"; + ss << "void main(uint3 c_local_id : SV_GroupID, uint3 c_global_id : SV_DispatchThreadID)\n"; + } +} + std::string ShaderGen::GenerateScreenQuadVertexShader() { std::stringstream ss; diff --git a/src/core/shadergen.h b/src/core/shadergen.h index d93a5c73b..fe7bc2a39 100644 --- a/src/core/shadergen.h +++ b/src/core/shadergen.h @@ -40,6 +40,7 @@ protected: const std::initializer_list>& additional_inputs, bool declare_fragcoord = false, u32 num_color_outputs = 1, bool depth_output = false, bool msaa = false, bool ssaa = false, bool declare_sample_id = false); + void DeclareComputeEntryPoint(std::stringstream& ss, u32 local_size_x, u32 local_size_y, u32 local_size_z); HostDisplay::RenderAPI m_render_api; bool m_glsl; diff --git a/src/frontend-common/fullscreen_ui.cpp b/src/frontend-common/fullscreen_ui.cpp index 00b5152ea..15929ae08 100644 --- a/src/frontend-common/fullscreen_ui.cpp +++ b/src/frontend-common/fullscreen_ui.cpp @@ -1970,7 +1970,7 @@ void DrawSettingsWindow() "to the hardware renderers.", &s_settings_copy.gpu_24bit_chroma_smoothing); - MenuHeading("PGXP (Precision Geometry Transform Pipeline"); + MenuHeading("PGXP (Precision Geometry Transform Pipeline)"); settings_changed |= ToggleButton("PGXP Geometry Correction",