From cf6a40fc12a29dfbfaba0efb4465b26ef9ddf82d Mon Sep 17 00:00:00 2001
From: ReinUsesLisp
Date: Wed, 13 May 2020 04:30:37 -0300
Subject: [PATCH 001/145] vk_rasterizer: Remove buffer check in attribute selection

This was a leftover from OpenGL, when disabled buffers were not properly
emulated. We no longer have to assert this as it is checked in vertex buffer
initialization.
---
 src/video_core/renderer_vulkan/vk_rasterizer.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 8b009fc22b..5af2a0d25b 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -879,10 +879,6 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
             vertex_input.SetAttribute(index, false, 0, 0, {}, {});
             continue;
         }
-
-        [[maybe_unused]] const auto& buffer = regs.vertex_array[attrib.buffer];
-        ASSERT(buffer.IsEnabled());
-
         vertex_input.SetAttribute(index, true, attrib.buffer, attrib.offset, attrib.type.Value(),
                                   attrib.size.Value());
     }

From 91dddca26eca0c2e00bb2974099908bf26d34f43 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp
Date: Wed, 13 May 2020 04:32:41 -0300
Subject: [PATCH 002/145] vk_rasterizer: Implement constant attributes

Constant attributes (known in OpenGL as disabled attributes) are not
supported on Vulkan, even with extensions. To emulate this behavior, we
return zero on reads from disabled vertex attributes in shader code. This
has no caching cost because attribute formats are not dynamic state on
Vulkan and we have to store them in the pipeline cache anyway.

- Fixes Animal Crossing: New Horizons terrain borders
---
 .../renderer_vulkan/vk_pipeline_cache.cpp | 4 ++-
 .../renderer_vulkan/vk_rasterizer.cpp | 2 +-
 .../renderer_vulkan/vk_shader_decompiler.cpp | 30 ++++++++++++-------
 .../renderer_vulkan/vk_shader_decompiler.h | 3 +-
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index fe45ed2697..890175d2d0 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -312,7 +312,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
         ASSERT(point_size != 0.0f);
     }
     for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) {
-        specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].Type();
+        const auto& attribute = fixed_state.vertex_input.attributes[i];
+        specialization.enabled_attributes[i] = attribute.enabled.Value() != 0;
+        specialization.attribute_types[i] = attribute.Type();
     }
     specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;

diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 5af2a0d25b..cf15e6d1c7 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -875,7 +875,7 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex

     for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
         const auto& attrib = regs.vertex_attrib_format[index];
-        if (!attrib.IsValid()) {
+        if (attrib.IsConstant()) {
             vertex_input.SetAttribute(index, false, 0, 0, {}, {});
             continue;
         }
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 18678968cb..6ce6bfcb5e 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -731,8 +731,10 @@ private:
             if (!IsGenericAttribute(index)) {
                 continue;
             }
-
             const u32 location = GetGenericAttributeLocation(index);
+            if (!IsAttributeEnabled(location)) {
+                continue;
+            }
             const auto type_descriptor = GetAttributeType(location);
             Id type;
             if (IsInputAttributeArray()) {
@@ -976,6 +978,10 @@ private:
         return stage == ShaderType::TesselationControl;
     }

+    bool IsAttributeEnabled(u32 location) const {
+        return stage != ShaderType::Vertex || specialization.enabled_attributes[location];
+    }
+
     u32 GetNumInputVertices() const {
         switch (stage) {
         case ShaderType::Geometry:
@@ -1192,16 +1198,20 @@ private:
             UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
             return {v_float_zero, Type::Float};
         default:
-            if (IsGenericAttribute(attribute)) {
-                const u32 location = GetGenericAttributeLocation(attribute);
-                const auto type_descriptor = GetAttributeType(location);
-                const Type type = type_descriptor.type;
-                const Id attribute_id = input_attributes.at(attribute);
-                const std::vector elements = {element};
-                const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
-                return {OpLoad(GetTypeDefinition(type), pointer), type};
+            if (!IsGenericAttribute(attribute)) {
+                break;
             }
-            break;
+            const u32 location = GetGenericAttributeLocation(attribute);
+            if (!IsAttributeEnabled(location)) {
+                // Disabled attributes (also known as constant attributes) always return zero.
+                return {v_float_zero, Type::Float};
+            }
+            const auto type_descriptor = GetAttributeType(location);
+            const Type type = type_descriptor.type;
+            const Id attribute_id = input_attributes.at(attribute);
+            const std::vector elements = {element};
+            const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
+            return {OpLoad(GetTypeDefinition(type), pointer), type};
         }
         UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast(attribute));
         return {v_float_zero, Type::Float};
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index f4c05ac3cf..b7af263882 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -88,7 +88,8 @@ struct Specialization final {
     u32 shared_memory_size{};

     // Graphics specific
-    std::optional point_size{};
+    std::optional point_size;
+    std::bitset enabled_attributes;
     std::array attribute_types{};
     bool ndc_minus_one_to_one{};
 };

From 4cff5dd1940cf8eded9daa445c16c90c89720a4a Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow
Date: Sun, 17 May 2020 21:24:44 -0400
Subject: [PATCH 003/145] OpenGL: Enable Debug Context and Synchronous debugging when graphics debugging is enabled.

This commit aims to help ease debugging of driver crashes without having to
modify existing code.
---
 src/video_core/renderer_opengl/renderer_opengl.cpp | 3 +++
 src/yuzu/bootmanager.cpp | 3 +++
 src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index b2a1797462..acabc34976 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -753,6 +753,9 @@ void RendererOpenGL::RenderScreenshot() {
 bool RendererOpenGL::Init() {
     if (GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
+        if (Settings::values.renderer_debug) {
+            glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
+        }
         glDebugMessageCallback(DebugHandler, nullptr);
     }

diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 1adf8932b3..1f5e43043c 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -106,6 +106,9 @@ public:
         format.setVersion(4, 3);
         format.setProfile(QSurfaceFormat::CompatibilityProfile);
         format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions);
+        if (Settings::values.renderer_debug) {
+            format.setOption(QSurfaceFormat::FormatOption::DebugContext);
+        }
         // TODO: expose a setting for buffer value (ie default/single/double/triple)
         format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior);
         format.setSwapInterval(0);
diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
index 411e7e6472..09cc0a3b59 100644
--- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
+++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
@@ -98,6 +98,9 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen)
     SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8);
     SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0);
     SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
+    if (Settings::values.renderer_debug) {
+        SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+    }
     SDL_GL_SetSwapInterval(0);

     std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname,

From ea14af2164934faa0761a88c6236b3652d8cdaea Mon Sep 17 00:00:00 2001
From: Neodyblue
Date: Thu, 21 May 2020 17:51:53 -0700
Subject: [PATCH 004/145] qt_themes: remove unknown qss property from dark theme

Qdarkstyle's qss file uses an overflow property. According to
`https://doc.qt.io/qt-5/stylesheet-reference.html`, the property `overflow`
doesn't exist, which leads to a warning message in the console.
---
 dist/qt_themes/qdarkstyle/style.qss | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/dist/qt_themes/qdarkstyle/style.qss b/dist/qt_themes/qdarkstyle/style.qss
index 7d088a7197..2d5c9761f6 100644
--- a/dist/qt_themes/qdarkstyle/style.qss
+++ b/dist/qt_themes/qdarkstyle/style.qss
@@ -673,10 +673,6 @@ QTabWidget::pane {
     border-bottom-left-radius: 2px;
 }

-QTabWidget::tab-bar {
-    overflow: visible;
-}
-
 QTabBar {
     qproperty-drawBase: 0;
     border-radius: 3px;

From efe7b7483b9c1fc09b43f40a87e43dab476c619a Mon Sep 17 00:00:00 2001
From: ReinUsesLisp
Date: Tue, 26 May 2020 00:57:27 -0300
Subject: [PATCH 005/145] fixed_pipeline_state: Remove unnecessary check for front faces flip

The check to flip faces when viewports are negative was a leftover from the
old OpenGL code. This is not required on Vulkan, where we have negative
viewports.
--- src/video_core/renderer_vulkan/fixed_pipeline_state.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 568744e3c1..424278816b 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -71,8 +71,7 @@ void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept { const u32 topology_index = static_cast(regs.draw.topology.Value()); u32 packed_front_face = PackFrontFace(regs.front_face); - if (regs.screen_y_control.triangle_rast_flip != 0 && - regs.viewport_transform[0].scale_y > 0.0f) { + if (regs.screen_y_control.triangle_rast_flip != 0) { // Flip front face packed_front_face = 1 - packed_front_face; } From 606a62d4c740eec470b19e57322645de20866abf Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 26 May 2020 00:57:38 -0300 Subject: [PATCH 006/145] gl_rasterizer: Port front face flip check from Vulkan While Vulkan was assuming we had no negative viewports, OpenGL code was assuming we had them. Port the old code from Vulkan to OpenGL, checking if the first viewport is negative before flipping faces. This is not a complete implementation since we only check for the first viewport to be negative. That said, unless a game is using Vulkan, OpenGL and NVN games should be fine here, and we can always compare with our Vulkan backend to see if there's a difference. --- .../renderer_opengl/gl_rasterizer.cpp | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 8116a5daa6..efbbf11f92 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -1031,6 +1031,26 @@ void RasterizerOpenGL::SyncViewport() { const auto& regs = gpu.regs; const bool dirty_viewport = flags[Dirty::Viewports]; + const bool dirty_clip_control = flags[Dirty::ClipControl]; + + if (dirty_clip_control || flags[Dirty::FrontFace]) { + flags[Dirty::FrontFace] = false; + + GLenum mode = MaxwellToGL::FrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0 && + regs.viewport_transform[0].scale_y < 0.0f) { + switch (mode) { + case GL_CW: + mode = GL_CCW; + break; + case GL_CCW: + mode = GL_CW; + break; + } + } + glFrontFace(mode); + } + if (dirty_viewport || flags[Dirty::ClipControl]) { flags[Dirty::ClipControl] = false; @@ -1128,11 +1148,6 @@ void RasterizerOpenGL::SyncCullMode() { glDisable(GL_CULL_FACE); } } - - if (flags[Dirty::FrontFace]) { - flags[Dirty::FrontFace] = false; - glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); - } } void RasterizerOpenGL::SyncPrimitiveRestart() { From 31eb658fea6ecae8fb4f0dde5abdeb1d8eb44ae0 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 19 May 2020 22:01:25 -0300 Subject: [PATCH 007/145] maxwell_3d: Initialize polygon modes NVN expects this to be initialized as Fill, otherwise games that never bind a rasterizer state will log an invalid polygon mode. 
--- src/video_core/engines/maxwell_3d.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 024c9e43b5..6113da7f8a 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -107,6 +107,8 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; + regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill; + regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill; shadow_state = regs; From f3f056c3b60fda242ae08f3866832dd307d537ba Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 20 May 2020 01:13:40 -0300 Subject: [PATCH 008/145] maxwell_3d: Initialize line widths Initialize line widths to avoid setting a line width of zero. --- src/video_core/engines/maxwell_3d.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 6113da7f8a..b5a70b9fca 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -106,6 +106,8 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rasterize_enable = 1; regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; + regs.line_width_aliased = 1.0f; + regs.line_width_smooth = 1.0f; regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill; regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill; From d2b25575426b9b52049b88d8d6d9ae83c81da312 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 27 May 2020 17:31:12 -0300 Subject: [PATCH 009/145] texture_cache: Use small vector for surface vectors This avoids most heap allocations when collecting surfaces into a vector. --- src/video_core/texture_cache/texture_cache.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d6efc34b20..d7e42697da 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig; template class TextureCache { + using VectorSurface = boost::container::small_vector; public: void InvalidateRegion(VAddr addr, std::size_t size) { @@ -498,7 +500,7 @@ private: * @param untopological Indicates to the recycler that the texture has no way * to match the overlaps due to topological reasons. **/ - RecycleStrategy PickStrategy(std::vector& overlaps, const SurfaceParams& params, + RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params, const GPUVAddr gpu_addr, const MatchTopologyResult untopological) { if (Settings::IsGPULevelExtreme()) { return RecycleStrategy::Flush; @@ -538,9 +540,8 @@ private: * @param untopological Indicates to the recycler that the texture has no way to match the * overlaps due to topological reasons. 
**/ - std::pair RecycleSurface(std::vector& overlaps, - const SurfaceParams& params, const GPUVAddr gpu_addr, - const bool preserve_contents, + std::pair RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params, + const GPUVAddr gpu_addr, const bool preserve_contents, const MatchTopologyResult untopological) { const bool do_load = preserve_contents && Settings::IsGPULevelExtreme(); for (auto& surface : overlaps) { @@ -650,7 +651,7 @@ private: * @param params The parameters on the new surface. * @param gpu_addr The starting address of the new surface. **/ - std::optional> TryReconstructSurface(std::vector& overlaps, + std::optional> TryReconstructSurface(VectorSurface& overlaps, const SurfaceParams& params, const GPUVAddr gpu_addr) { if (params.target == SurfaceTarget::Texture3D) { @@ -708,7 +709,7 @@ private: * @param preserve_contents Indicates that the new surface should be loaded from memory or * left blank. */ - std::optional> Manage3DSurfaces(std::vector& overlaps, + std::optional> Manage3DSurfaces(VectorSurface& overlaps, const SurfaceParams& params, const GPUVAddr gpu_addr, const VAddr cpu_addr, @@ -810,7 +811,7 @@ private: TSurface& current_surface = iter->second; const auto topological_result = current_surface->MatchesTopology(params); if (topological_result != MatchTopologyResult::FullMatch) { - std::vector overlaps{current_surface}; + VectorSurface overlaps{current_surface}; return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, topological_result); } @@ -1124,14 +1125,14 @@ private: } } - std::vector GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { + VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { if (size == 0) { return {}; } const VAddr cpu_addr_end = cpu_addr + size; VAddr start = cpu_addr >> registry_page_bits; const VAddr end = (cpu_addr_end - 1) >> registry_page_bits; - std::vector surfaces; + VectorSurface surfaces; while (start <= end) { std::vector& list = registry[start]; for (auto& surface : list) { From b8b6f94ba9a662857c40d819ac40755c7984cb16 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 27 May 2020 17:59:04 -0300 Subject: [PATCH 010/145] texture_cache: Use unordered_map::find instead of operator[] on hot code --- src/video_core/texture_cache/texture_cache.h | 36 +++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d7e42697da..99f74e6c4e 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -310,18 +310,20 @@ public: dst_surface.first->MarkAsModified(true, Tick()); } - TSurface TryFindFramebufferSurface(VAddr addr) { + TSurface TryFindFramebufferSurface(VAddr addr) const { if (!addr) { return nullptr; } const VAddr page = addr >> registry_page_bits; - std::vector& list = registry[page]; - for (auto& surface : list) { - if (surface->GetCpuAddr() == addr) { - return surface; - } + const auto it = registry.find(page); + if (it == registry.end()) { + return nullptr; } - return nullptr; + const auto& list = it->second; + const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) { + return surface->GetCpuAddr() == addr; + }); + return found != list.end() ? 
*found : nullptr; } u64 Tick() { @@ -1130,18 +1132,20 @@ private: return {}; } const VAddr cpu_addr_end = cpu_addr + size; - VAddr start = cpu_addr >> registry_page_bits; const VAddr end = (cpu_addr_end - 1) >> registry_page_bits; VectorSurface surfaces; - while (start <= end) { - std::vector& list = registry[start]; - for (auto& surface : list) { - if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) { - surface->MarkAsPicked(true); - surfaces.push_back(surface); - } + for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) { + const auto it = registry.find(start); + if (it == registry.end()) { + continue; + } + for (auto& surface : it->second) { + if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) { + continue; + } + surface->MarkAsPicked(true); + surfaces.push_back(surface); } - start++; } for (auto& surface : surfaces) { surface->MarkAsPicked(false); From 3b2dee88e67eaa968fd0dba19d10871e0b6570b9 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 27 May 2020 23:05:50 -0300 Subject: [PATCH 011/145] buffer_cache: Avoid copying twice on certain cases Avoid copying to a staging buffer on non-granular memory addresses. Add a callable argument to StreamBufferUpload to be able to copy to the staging buffer directly from ReadBlockUnsafe. --- src/video_core/buffer_cache/buffer_cache.h | 40 +++++++++++++--------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d9a4a1b4d1..b88fce2cd5 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -56,24 +56,28 @@ public: if (use_fast_cbuf || size < max_stream_size) { if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { auto& memory_manager = system.GPU().MemoryManager(); + const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size); if (use_fast_cbuf) { - if (memory_manager.IsGranularRange(gpu_addr, size)) { - const auto host_ptr = memory_manager.GetPointer(gpu_addr); - return ConstBufferUpload(host_ptr, size); + u8* dest; + if (is_granular) { + dest = memory_manager.GetPointer(gpu_addr); } else { staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - return ConstBufferUpload(staging_buffer.data(), size); + dest = staging_buffer.data(); + memory_manager.ReadBlockUnsafe(gpu_addr, dest, size); } + return ConstBufferUpload(dest, size); + } + if (is_granular) { + u8* const host_ptr = memory_manager.GetPointer(gpu_addr); + return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { + std::memcpy(dest, host_ptr, size); + }); } else { - if (memory_manager.IsGranularRange(gpu_addr, size)) { - const auto host_ptr = memory_manager.GetPointer(gpu_addr); - return StreamBufferUpload(host_ptr, size, alignment); - } else { - staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - return StreamBufferUpload(staging_buffer.data(), size, alignment); - } + return StreamBufferUpload( + size, alignment, [&memory_manager, gpu_addr, size](u8* dest) { + memory_manager.ReadBlockUnsafe(gpu_addr, dest, size); + }); } } } @@ -101,7 +105,9 @@ public: BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4) { std::lock_guard lock{mutex}; - return StreamBufferUpload(raw_pointer, size, alignment); + return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { + std::memcpy(dest, 
raw_pointer, size); + }); } void Map(std::size_t max_size) { @@ -424,11 +430,11 @@ private: map->MarkAsModified(false, 0); } - BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size, - std::size_t alignment) { + template + BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { AlignBuffer(alignment); const std::size_t uploaded_offset = buffer_offset; - std::memcpy(buffer_ptr, raw_pointer, size); + callable(buffer_ptr); buffer_ptr += size; buffer_offset += size; From fc153f6bcd9884064eb2752fc7be5a6458bbd71b Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 28 May 2020 17:08:44 -0300 Subject: [PATCH 012/145] format_lookup_table: Implement G24S8 format as S8Z24 --- src/video_core/texture_cache/format_lookup_table.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 7032e00594..f476f03b01 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -41,7 +41,7 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array DefinitionTable = {{ +constexpr std::array DefinitionTable = {{ {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, @@ -98,6 +98,7 @@ constexpr std::array DefinitionTable = {{ {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F}, {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16}, {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, + {TextureFormat::G24R8, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8}, {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1}, From 43bf860b2237cfa873e2452dd8d0b5f43193a05e Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 29 May 2020 13:48:01 +1000 Subject: [PATCH 013/145] kernel: ResourceLimit::Reserve remove useless while loop Timeout is a u64, it will always be >= 0 --- src/core/hle/kernel/resource_limit.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/core/hle/kernel/resource_limit.cpp b/src/core/hle/kernel/resource_limit.cpp index d9beaa3a40..212e442f42 100644 --- a/src/core/hle/kernel/resource_limit.cpp +++ b/src/core/hle/kernel/resource_limit.cpp @@ -24,13 +24,9 @@ bool ResourceLimit::Reserve(ResourceType resource, s64 amount, u64 timeout) { const std::size_t index{ResourceTypeToIndex(resource)}; s64 new_value = current[index] + amount; - while (new_value > limit[index] && available[index] + amount <= limit[index]) { + if (new_value > limit[index] && available[index] + amount <= limit[index]) { // TODO(bunnei): This is wrong for multicore, we should wait the calling thread for timeout new_value = current[index] + amount; - - if (timeout >= 0) { - break; - } } if (new_value <= limit[index]) { From 5b37cecd76205612bfc2cc1d0b475d893fe7ee6a Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 01:48:01 -0300 Subject: [PATCH 014/145] texture_cache: Handle overlaps with multiple subresources Implement more surface reconstruct cases. Allow overlaps with more than one layer and mipmap and copies all of them to the new texture. 
- Fixes textures moving around objects on Xenoblade games --- src/video_core/texture_cache/texture_cache.h | 62 +++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 8bfc541d49..658264860a 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -652,45 +652,54 @@ private: **/ std::optional> TryReconstructSurface(std::vector& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr) { + GPUVAddr gpu_addr) { if (params.target == SurfaceTarget::Texture3D) { - return {}; + return std::nullopt; } - bool modified = false; TSurface new_surface = GetUncachedSurface(gpu_addr, params); - u32 passed_tests = 0; + std::size_t passed_tests = 0; + bool modified = false; + for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.is_layered || src_params.num_levels > 1) { - // We send this cases to recycle as they are more complex to handle - return {}; - } - const std::size_t candidate_size = surface->GetSizeInBytes(); - auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; + const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; if (!mipmap_layer) { continue; } - const auto [layer, mipmap] = *mipmap_layer; - if (new_surface->GetMipmapSize(mipmap) != candidate_size) { + const auto [base_layer, base_mipmap] = *mipmap_layer; + if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) { continue; } + + // Copy all mipmaps and layers + const u32 block_width = params.GetDefaultBlockWidth(); + const u32 block_height = params.GetDefaultBlockHeight(); + for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) { + const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); + const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); + if (width < block_width || height < block_height) { + // Current APIs forbid copying small compressed textures, avoid errors + break; + } + const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, + src_params.depth); + ImageCopy(surface, new_surface, copy_params); + } + ++passed_tests; modified |= surface->IsModified(); - // Now we got all the data set up - const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); - const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); - const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1); - passed_tests++; - ImageCopy(surface, new_surface, copy_params); } if (passed_tests == 0) { - return {}; - // In Accurate GPU all tests should pass, else we recycle - } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { - return {}; + return std::nullopt; } + if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { + // In Accurate GPU all tests should pass, else we recycle + return std::nullopt; + } + for (const auto& surface : overlaps) { Unregister(surface); } + new_surface->MarkAsModified(modified, Tick()); Register(new_surface); return {{new_surface, new_surface->GetMainView()}}; @@ -868,12 +877,9 @@ private: // two things either the candidate surface is a supertexture of the overlap // or they don't match in any known way. 
if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { - if (current_surface->GetGpuAddr() == gpu_addr) { - std::optional> view = - TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } + const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr); + if (view) { + return *view; } return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, MatchTopologyResult::FullMatch); From 5616be12bef10a7eb15c4a1391954d9179f616d1 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 03:05:26 -0300 Subject: [PATCH 015/145] vk_rasterizer: Skip transform feedbacks when extension is unavailable Avoids calling transform feedback procedures when VK_EXT_transform_feedback is not available. --- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index be5b77fae5..882751f09b 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -838,6 +838,10 @@ void RasterizerVulkan::BeginTransformFeedback() { if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); + return; + } UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || @@ -866,6 +870,9 @@ void RasterizerVulkan::EndTransformFeedback() { if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + return; + } scheduler.Record( [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); From dd70e097ccb84b64983456759525d650d1ceab0a Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 20:10:58 -0300 Subject: [PATCH 016/145] texture_cache: Reload textures when number of resources mismatch --- src/video_core/texture_cache/texture_cache.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 658264860a..62206b9060 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -660,6 +660,15 @@ private: std::size_t passed_tests = 0; bool modified = false; + u32 num_resources = 0; + for (auto& surface : overlaps) { + const SurfaceParams& src_params = surface->GetSurfaceParams(); + num_resources += src_params.depth * src_params.num_levels; + } + if (num_resources != params.depth * params.num_levels) { + LoadSurface(new_surface); + } + for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; From e454f7e7a7d75fe0415ce48ffbd5d5979d79ce67 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 20:12:46 -0300 Subject: [PATCH 017/145] texture_cache: Only copy textures that were modified from host --- src/video_core/texture_cache/texture_cache.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 62206b9060..3e024a0981 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -679,6 +679,12 @@ private: if (new_surface->GetMipmapSize(base_mipmap) != 
surface->GetMipmapSize(0)) { continue; } + ++passed_tests; + + if (!surface->IsModified()) { + continue; + } + modified = true; // Copy all mipmaps and layers const u32 block_width = params.GetDefaultBlockWidth(); @@ -694,8 +700,6 @@ private: src_params.depth); ImageCopy(surface, new_surface, copy_params); } - ++passed_tests; - modified |= surface->IsModified(); } if (passed_tests == 0) { return std::nullopt; From b032ebdfee1928c4458eaf15faa0cff299371e65 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 29 May 2020 14:53:27 +1000 Subject: [PATCH 018/145] Implement macro JIT --- src/core/settings.h | 1 + src/video_core/CMakeLists.txt | 8 +- src/video_core/engines/maxwell_3d.cpp | 19 +- src/video_core/engines/maxwell_3d.h | 19 +- src/video_core/macro/macro.cpp | 45 ++ src/video_core/macro/macro.h | 128 ++++ .../{ => macro}/macro_interpreter.cpp | 200 ++---- .../{ => macro}/macro_interpreter.h | 51 +- src/video_core/macro/macro_jit_x64.cpp | 633 ++++++++++++++++++ src/video_core/macro/macro_jit_x64.h | 98 +++ src/yuzu/configuration/config.cpp | 3 + src/yuzu/configuration/configure_debug.cpp | 3 + src/yuzu/configuration/configure_debug.ui | 13 + src/yuzu_cmd/config.cpp | 2 + src/yuzu_cmd/default_ini.h | 2 + 15 files changed, 1035 insertions(+), 190 deletions(-) create mode 100644 src/video_core/macro/macro.cpp create mode 100644 src/video_core/macro/macro.h rename src/video_core/{ => macro}/macro_interpreter.cpp (63%) rename src/video_core/{ => macro}/macro_interpreter.h (69%) create mode 100644 src/video_core/macro/macro_jit_x64.cpp create mode 100644 src/video_core/macro/macro_jit_x64.h diff --git a/src/core/settings.h b/src/core/settings.h index 78eb33737d..36cd66fd4c 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -474,6 +474,7 @@ struct Values { bool reporting_services; bool quest_flag; bool disable_cpu_opt; + bool disable_macro_jit; // BCAT std::string bcat_backend; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index d6ee828369..2bf8d68ce0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -25,6 +25,12 @@ add_library(video_core STATIC engines/shader_bytecode.h engines/shader_header.h engines/shader_type.h + macro/macro.cpp + macro/macro.h + macro/macro_interpreter.cpp + macro/macro_interpreter.h + macro/macro_jit_x64.cpp + macro/macro_jit_x64.h fence_manager.h gpu.cpp gpu.h @@ -36,8 +42,6 @@ add_library(video_core STATIC gpu_thread.h guest_driver.cpp guest_driver.h - macro_interpreter.cpp - macro_interpreter.h memory_manager.cpp memory_manager.h morton.cpp diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 004f6b2615..934a1d6dbd 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, - macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { + macro_engine(GetMacroEngine(*this)), upload_state{memory_manager, regs.upload} { dirty.flags.flip(); - InitializeRegisterDefaults(); } @@ -116,7 +115,7 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { +void Maxwell3D::CallMacroMethod(u32 method, std::vector&& 
parameters) { // Reset the current macro. executing_macro = 0; @@ -125,7 +124,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3 ((method - MacroRegistersStart) >> 1) % static_cast(macro_positions.size()); // Execute the current macro. - macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters); + macro_engine->Execute(macro_positions[entry], std::move(parameters)); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } @@ -161,8 +160,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Call the macro when there are no more parameters in the command buffer if (is_last_call) { - CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); - macro_params.clear(); + CallMacroMethod(executing_macro, std::move(macro_params)); } return; } @@ -197,7 +195,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { break; } case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(arg); + macro_engine->AddCode(regs.macros.upload_address, arg); break; } case MAXWELL3D_REG_INDEX(macros.bind): { @@ -306,8 +304,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Call the macro when there are no more parameters in the command buffer if (amount == methods_pending) { - CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); - macro_params.clear(); + CallMacroMethod(executing_macro, std::move(macro_params)); } return; } @@ -420,9 +417,7 @@ void Maxwell3D::FlushMMEInlineDraw() { } void Maxwell3D::ProcessMacroUpload(u32 data) { - ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), - "upload_address exceeded macro_memory size!"); - macro_memory[regs.macros.upload_address++] = data; + macro_engine->AddCode(regs.macros.upload_address++, data); } void Maxwell3D::ProcessMacroBind(u32 data) { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 05dd6b39bf..077bc9841b 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -23,7 +23,7 @@ #include "video_core/engines/engine_upload.h" #include "video_core/engines/shader_type.h" #include "video_core/gpu.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro.h" #include "video_core/textures/texture.h" namespace Core { @@ -1411,15 +1411,6 @@ public: const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; - /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than - /// we've seen used. - using MacroMemory = std::array; - - /// Gets a reference to macro memory. - const MacroMemory& GetMacroMemory() const { - return macro_memory; - } - bool ShouldExecute() const { return execute_on; } @@ -1468,16 +1459,14 @@ private: std::array mme_inline{}; - /// Memory for macro code - MacroMemory macro_memory; - /// Macro method that is currently being executed / being fed parameters. u32 executing_macro = 0; /// Parameters that have been submitted to the macro call so far. std::vector macro_params; /// Interpreter for the macro codes uploaded to the GPU. 
- MacroInterpreter macro_interpreter; + std::unique_ptr macro_engine; + // MacroInterpreter macro_interpreter; static constexpr u32 null_cb_data = 0xFFFFFFFF; struct { @@ -1506,7 +1495,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters); + void CallMacroMethod(u32 method, std::vector&& parameters); /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp new file mode 100644 index 0000000000..85a6e5dd42 --- /dev/null +++ b/src/video_core/macro/macro.cpp @@ -0,0 +1,45 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/logging/log.h" +#include "core/settings.h" +#include "video_core/macro/macro.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +namespace Tegra { + +void MacroEngine::AddCode(u32 method, u32 data) { + uploaded_macro_code[method].push_back(data); +} + +void MacroEngine::Execute(u32 method, std::vector parameters) { + auto compiled_macro = macro_cache.find(method); + if (compiled_macro != macro_cache.end()) { + compiled_macro->second->Execute(parameters, method); + } else { + // Macro not compiled, check if it's uploaded and if so, compile it + auto macro_code = uploaded_macro_code.find(method); + if (macro_code == uploaded_macro_code.end()) { + UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); + return; + } + macro_cache[method] = Compile(macro_code->second); + macro_cache[method]->Execute(parameters, method); + } +} + +std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d) { + if (Settings::values.disable_macro_jit) { + return std::make_unique(maxwell3d); + } +#ifdef ARCHITECTURE_x86_64 + return std::make_unique(maxwell3d); +#else + return std::make_unique(maxwell3d); +#endif +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h new file mode 100644 index 0000000000..28ca243d10 --- /dev/null +++ b/src/video_core/macro/macro.h @@ -0,0 +1,128 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Tegra { +namespace Engines { +class Maxwell3D; +} +namespace Macro { +constexpr std::size_t NUM_MACRO_REGISTERS = 8; +enum class Operation : u32 { + ALU = 0, + AddImmediate = 1, + ExtractInsert = 2, + ExtractShiftLeftImmediate = 3, + ExtractShiftLeftRegister = 4, + Read = 5, + Unused = 6, // This operation doesn't seem to be a valid encoding. + Branch = 7, +}; + +enum class ALUOperation : u32 { + Add = 0, + AddWithCarry = 1, + Subtract = 2, + SubtractWithBorrow = 3, + // Operations 4-7 don't seem to be valid encodings. 
+ Xor = 8, + Or = 9, + And = 10, + AndNot = 11, + Nand = 12 +}; + +enum class ResultOperation : u32 { + IgnoreAndFetch = 0, + Move = 1, + MoveAndSetMethod = 2, + FetchAndSend = 3, + MoveAndSend = 4, + FetchAndSetMethod = 5, + MoveAndSetMethodFetchAndSend = 6, + MoveAndSetMethodSend = 7 +}; + +enum class BranchCondition : u32 { + Zero = 0, + NotZero = 1, +}; + +union Opcode { + u32 raw; + BitField<0, 3, Operation> operation; + BitField<4, 3, ResultOperation> result_operation; + BitField<4, 1, BranchCondition> branch_condition; + // If set on a branch, then the branch doesn't have a delay slot. + BitField<5, 1, u32> branch_annul; + BitField<7, 1, u32> is_exit; + BitField<8, 3, u32> dst; + BitField<11, 3, u32> src_a; + BitField<14, 3, u32> src_b; + // The signed immediate overlaps the second source operand and the alu operation. + BitField<14, 18, s32> immediate; + + BitField<17, 5, ALUOperation> alu_operation; + + // Bitfield instructions data + BitField<17, 5, u32> bf_src_bit; + BitField<22, 5, u32> bf_size; + BitField<27, 5, u32> bf_dst_bit; + + u32 GetBitfieldMask() const { + return (1 << bf_size) - 1; + } + + s32 GetBranchTarget() const { + return static_cast(immediate * sizeof(u32)); + } +}; + +union MethodAddress { + u32 raw; + BitField<0, 12, u32> address; + BitField<12, 6, u32> increment; +}; + +} // namespace Macro + +class CachedMacro { +public: + virtual ~CachedMacro() = default; + /** + * Executes the macro code with the specified input parameters. + * @param code The macro byte code to execute + * @param parameters The parameters of the macro + */ + virtual void Execute(std::vector& parameters, u32 method) = 0; +}; + +class MacroEngine { +public: + virtual ~MacroEngine() = default; + + // Store the uploaded macro code to compile them when they're called. + void AddCode(u32 method, u32 data); + + // Compiles the macro if its not in the cache, and executes the compiled macro + void Execute(u32 method, std::vector parameters); + +protected: + virtual std::unique_ptr Compile(const std::vector& code) = 0; + +private: + std::unordered_map> macro_cache; + std::unordered_map> uploaded_macro_code; +}; + +std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d); + +} // namespace Tegra diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp similarity index 63% rename from src/video_core/macro_interpreter.cpp rename to src/video_core/macro/macro_interpreter.cpp index 9473649282..e63296a21f 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -1,4 +1,4 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. @@ -6,109 +6,46 @@ #include "common/logging/log.h" #include "common/microprofile.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro_interpreter.h" MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -namespace { -enum class Operation : u32 { - ALU = 0, - AddImmediate = 1, - ExtractInsert = 2, - ExtractShiftLeftImmediate = 3, - ExtractShiftLeftRegister = 4, - Read = 5, - Unused = 6, // This operation doesn't seem to be a valid encoding. 
- Branch = 7, -}; -} // Anonymous namespace - -enum class MacroInterpreter::ALUOperation : u32 { - Add = 0, - AddWithCarry = 1, - Subtract = 2, - SubtractWithBorrow = 3, - // Operations 4-7 don't seem to be valid encodings. - Xor = 8, - Or = 9, - And = 10, - AndNot = 11, - Nand = 12 -}; - -enum class MacroInterpreter::ResultOperation : u32 { - IgnoreAndFetch = 0, - Move = 1, - MoveAndSetMethod = 2, - FetchAndSend = 3, - MoveAndSend = 4, - FetchAndSetMethod = 5, - MoveAndSetMethodFetchAndSend = 6, - MoveAndSetMethodSend = 7 -}; - -enum class MacroInterpreter::BranchCondition : u32 { - Zero = 0, - NotZero = 1, -}; - -union MacroInterpreter::Opcode { - u32 raw; - BitField<0, 3, Operation> operation; - BitField<4, 3, ResultOperation> result_operation; - BitField<4, 1, BranchCondition> branch_condition; - // If set on a branch, then the branch doesn't have a delay slot. - BitField<5, 1, u32> branch_annul; - BitField<7, 1, u32> is_exit; - BitField<8, 3, u32> dst; - BitField<11, 3, u32> src_a; - BitField<14, 3, u32> src_b; - // The signed immediate overlaps the second source operand and the alu operation. - BitField<14, 18, s32> immediate; - - BitField<17, 5, ALUOperation> alu_operation; - - // Bitfield instructions data - BitField<17, 5, u32> bf_src_bit; - BitField<22, 5, u32> bf_size; - BitField<27, 5, u32> bf_dst_bit; - - u32 GetBitfieldMask() const { - return (1 << bf_size) - 1; - } - - s32 GetBranchTarget() const { - return static_cast(immediate * sizeof(u32)); - } -}; - MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} -void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) { +std::unique_ptr MacroInterpreter::Compile(const std::vector& code) { + return std::make_unique(maxwell3d, code); +} + +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, + const std::vector& code) + : maxwell3d(maxwell3d), code(code) {} + +void MacroInterpreterImpl::Execute(std::vector& parameters, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); registers[1] = parameters[0]; + num_parameters = parameters.size(); if (num_parameters > parameters_capacity) { parameters_capacity = num_parameters; this->parameters = std::make_unique(num_parameters); } - std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32)); + std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32)); this->num_parameters = num_parameters; // Execute the code until we hit an exit condition. 
bool keep_executing = true; while (keep_executing) { - keep_executing = Step(offset, false); + keep_executing = Step(false); } // Assert the the macro used all the input parameters ASSERT(next_parameter_index == num_parameters); } -void MacroInterpreter::Reset() { +void MacroInterpreterImpl::Reset() { registers = {}; pc = 0; delayed_pc = {}; @@ -120,10 +57,10 @@ void MacroInterpreter::Reset() { carry_flag = false; } -bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { +bool MacroInterpreterImpl::Step(bool is_delay_slot) { u32 base_address = pc; - Opcode opcode = GetOpcode(offset); + Macro::Opcode opcode = GetOpcode(); pc += 4; // Update the program counter if we were delayed @@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { } switch (opcode.operation) { - case Operation::ALU: { + case Macro::Operation::ALU: { u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), GetRegister(opcode.src_b)); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::AddImmediate: { + case Macro::Operation::AddImmediate: { ProcessResult(opcode.result_operation, opcode.dst, GetRegister(opcode.src_a) + opcode.immediate); break; } - case Operation::ExtractInsert: { + case Macro::Operation::ExtractInsert: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, dst); break; } - case Operation::ExtractShiftLeftImmediate: { + case Macro::Operation::ExtractShiftLeftImmediate: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::ExtractShiftLeftRegister: { + case Macro::Operation::ExtractShiftLeftRegister: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Read: { + case Macro::Operation::Read: { u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Branch: { + case Macro::Operation::Branch: { ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); u32 value = GetRegister(opcode.src_a); bool taken = EvaluateBranchCondition(opcode.branch_condition, value); @@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { delayed_pc = base_address + opcode.GetBranchTarget(); // Execute one more instruction due to the delay slot. - return Step(offset, true); + return Step(true); } break; } @@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { // cause an exit if it's executed inside a delay slot. 
if (opcode.is_exit && !is_delay_slot) { // Exit has a delay slot, execute the next instruction - Step(offset, true); + Step(true); return false; } return true; } -MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { - const auto& macro_memory{maxwell3d.GetMacroMemory()}; - ASSERT((pc % sizeof(u32)) == 0); - ASSERT((pc + offset) < macro_memory.size() * sizeof(u32)); - return {macro_memory[offset + pc / sizeof(u32)]}; -} - -u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) { +u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) { switch (operation) { - case ALUOperation::Add: { + case Macro::ALUOperation::Add: { const u64 result{static_cast(src_a) + src_b}; carry_flag = result > 0xffffffff; return static_cast(result); } - case ALUOperation::AddWithCarry: { + case Macro::ALUOperation::AddWithCarry: { const u64 result{static_cast(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; carry_flag = result > 0xffffffff; return static_cast(result); } - case ALUOperation::Subtract: { + case Macro::ALUOperation::Subtract: { const u64 result{static_cast(src_a) - src_b}; carry_flag = result < 0x100000000; return static_cast(result); } - case ALUOperation::SubtractWithBorrow: { + case Macro::ALUOperation::SubtractWithBorrow: { const u64 result{static_cast(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; carry_flag = result < 0x100000000; return static_cast(result); } - case ALUOperation::Xor: + case Macro::ALUOperation::Xor: return src_a ^ src_b; - case ALUOperation::Or: + case Macro::ALUOperation::Or: return src_a | src_b; - case ALUOperation::And: + case Macro::ALUOperation::And: return src_a & src_b; - case ALUOperation::AndNot: + case Macro::ALUOperation::AndNot: return src_a & ~src_b; - case ALUOperation::Nand: + case Macro::ALUOperation::Nand: return ~(src_a & src_b); default: @@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) } } -void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) { +void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) { switch (operation) { - case ResultOperation::IgnoreAndFetch: + case Macro::ResultOperation::IgnoreAndFetch: // Fetch parameter and ignore result. SetRegister(reg, FetchParameter()); break; - case ResultOperation::Move: + case Macro::ResultOperation::Move: // Move result. SetRegister(reg, result); break; - case ResultOperation::MoveAndSetMethod: + case Macro::ResultOperation::MoveAndSetMethod: // Move result and use as Method Address. SetRegister(reg, result); SetMethodAddress(result); break; - case ResultOperation::FetchAndSend: + case Macro::ResultOperation::FetchAndSend: // Fetch parameter and send result. SetRegister(reg, FetchParameter()); Send(result); break; - case ResultOperation::MoveAndSend: + case Macro::ResultOperation::MoveAndSend: // Move and send result. SetRegister(reg, result); Send(result); break; - case ResultOperation::FetchAndSetMethod: + case Macro::ResultOperation::FetchAndSetMethod: // Fetch parameter and use result as Method Address. SetRegister(reg, FetchParameter()); SetMethodAddress(result); break; - case ResultOperation::MoveAndSetMethodFetchAndSend: + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: // Move result and use as Method Address, then fetch and send parameter. 
SetRegister(reg, result); SetMethodAddress(result); Send(FetchParameter()); break; - case ResultOperation::MoveAndSetMethodSend: + case Macro::ResultOperation::MoveAndSetMethodSend: // Move result and use as Method Address, then send bits 12:17 of result. SetRegister(reg, result); SetMethodAddress(result); @@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res } } -u32 MacroInterpreter::FetchParameter() { - ASSERT(next_parameter_index < num_parameters); - return parameters[next_parameter_index++]; +bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { + switch (cond) { + case Macro::BranchCondition::Zero: + return value == 0; + case Macro::BranchCondition::NotZero: + return value != 0; + } + UNREACHABLE(); + return true; } -u32 MacroInterpreter::GetRegister(u32 register_id) const { +Macro::Opcode MacroInterpreterImpl::GetOpcode() const { + ASSERT((pc % sizeof(u32)) == 0); + ASSERT(pc < code.size() * sizeof(u32)); + return {code[pc / sizeof(u32)]}; +} + +u32 MacroInterpreterImpl::GetRegister(u32 register_id) const { return registers.at(register_id); } -void MacroInterpreter::SetRegister(u32 register_id, u32 value) { +void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) { // Register 0 is hardwired as the zero register. // Ensure no writes to it actually occur. if (register_id == 0) { @@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) { registers.at(register_id) = value; } -void MacroInterpreter::SetMethodAddress(u32 address) { +void MacroInterpreterImpl::SetMethodAddress(u32 address) { method_address.raw = address; } -void MacroInterpreter::Send(u32 value) { +void MacroInterpreterImpl::Send(u32 value) { maxwell3d.CallMethodFromMME(method_address.address, value); // Increment the method address by the method increment. method_address.address.Assign(method_address.address.Value() + method_address.increment.Value()); } -u32 MacroInterpreter::Read(u32 method) const { +u32 MacroInterpreterImpl::Read(u32 method) const { return maxwell3d.GetRegisterValue(method); } -bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const { - switch (cond) { - case BranchCondition::Zero: - return value == 0; - case BranchCondition::NotZero: - return value != 0; - } - UNREACHABLE(); - return true; +u32 MacroInterpreterImpl::FetchParameter() { + ASSERT(next_parameter_index < num_parameters); + return parameters[next_parameter_index++]; } } // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h similarity index 69% rename from src/video_core/macro_interpreter.h rename to src/video_core/macro/macro_interpreter.h index 631146d89f..fb923f7b9f 100644 --- a/src/video_core/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -1,44 +1,37 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once - #include #include - +#include #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/macro/macro.h" namespace Tegra { namespace Engines { class Maxwell3D; } -class MacroInterpreter final { +class MacroInterpreter final : public MacroEngine { public: explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); - /** - * Executes the macro code with the specified input parameters. - * @param offset Offset to start execution at. 
- * @param parameters The parameters of the macro. - */ - void Execute(u32 offset, std::size_t num_parameters, const u32* parameters); +protected: + std::unique_ptr Compile(const std::vector& code) override; private: - enum class ALUOperation : u32; - enum class BranchCondition : u32; - enum class ResultOperation : u32; + Engines::Maxwell3D& maxwell3d; +}; - union Opcode; - - union MethodAddress { - u32 raw; - BitField<0, 12, u32> address; - BitField<12, 6, u32> increment; - }; +class MacroInterpreterImpl : public CachedMacro { +public: + MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector& code); + void Execute(std::vector& parameters, u32 method) override; +private: /// Resets the execution engine state, zeroing registers, etc. void Reset(); @@ -49,20 +42,20 @@ private: * @param is_delay_slot Whether the current step is being executed due to a delay slot in a * previous instruction. */ - bool Step(u32 offset, bool is_delay_slot); + bool Step(bool is_delay_slot); /// Calculates the result of an ALU operation. src_a OP src_b; - u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); + u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b); /// Performs the result operation on the input result and stores it in the specified register /// (if necessary). - void ProcessResult(ResultOperation operation, u32 reg, u32 result); + void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result); /// Evaluates the branch condition and returns whether the branch should be taken or not. - bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; + bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const; /// Reads an opcode at the current program counter location. - Opcode GetOpcode(u32 offset) const; + Macro::Opcode GetOpcode() const; /// Returns the specified register's value. Register 0 is hardcoded to always return 0. u32 GetRegister(u32 register_id) const; @@ -89,13 +82,11 @@ private: /// Program counter to execute at after the delay slot is executed. std::optional delayed_pc; - static constexpr std::size_t NumMacroRegisters = 8; - /// General purpose macro registers. - std::array registers = {}; + std::array registers = {}; /// Method address to use for the next Send instruction. - MethodAddress method_address = {}; + Macro::MethodAddress method_address = {}; /// Input parameters of the current macro. std::unique_ptr parameters; @@ -105,5 +96,7 @@ private: u32 next_parameter_index = 0; bool carry_flag = false; + const std::vector& code; }; + } // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp new file mode 100644 index 0000000000..1b657236a6 --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -0,0 +1,633 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "common/x64/xbyak_util.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47)); +MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); + +namespace Tegra { +using JitFunction = void (MacroJITx64Impl::*)(Macro::Opcode opcode); +const std::array InstructionTable{ + &MacroJITx64Impl::Compile_ALU, + &MacroJITx64Impl::Compile_AddImmediate, + &MacroJITx64Impl::Compile_ExtractInsert, + &MacroJITx64Impl::Compile_ExtractShiftLeftImmediate, + &MacroJITx64Impl::Compile_ExtractShiftLeftRegister, + &MacroJITx64Impl::Compile_Read, + nullptr, + &MacroJITx64Impl::Compile_Branch, +}; + +static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; +static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; +static const Xbyak::Reg64 STATE = Xbyak::util::r11; +static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; +static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; +static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13; +static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; +static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; + +static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ + PARAMETERS, + REGISTERS, + STATE, + NEXT_PARAMETER, + RESULT, + METHOD_ADDRESS, + BRANCH_HOLDER, +}); + +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} + +std::unique_ptr MacroJITx64::Compile(const std::vector& code) { + return std::make_unique(maxwell3d, code); +} + +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector& code) + : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { + Compile(); +} + +MacroJITx64Impl::~MacroJITx64Impl() = default; + +void MacroJITx64Impl::Execute(std::vector& parameters, u32 method) { + MICROPROFILE_SCOPE(MacroJitExecute); + ASSERT_OR_EXECUTE(program != nullptr, { return; }); + JITState state{}; + state.maxwell3d = &maxwell3d; + state.registers = {}; + state.parameters = parameters.data(); + program(&state); +} + +void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { + const bool is_a_zero = opcode.src_a == 0; + const bool is_b_zero = opcode.src_b == 0; + const bool valid_operation = !is_a_zero && !is_b_zero; + const bool is_move_operation = !is_a_zero && is_b_zero; + const bool has_zero_register = is_a_zero || is_b_zero; + + Xbyak::Reg64 src_a; + Xbyak::Reg32 src_b; + + if (!optimizer.zero_reg_skip) { + src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + src_b = Compile_GetRegister(opcode.src_b, ebx); + } else { + if (!is_a_zero) { + src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + } + if (!is_b_zero) { + src_b = Compile_GetRegister(opcode.src_b, ebx); + } + } + Xbyak::Label skip_carry{}; + + bool has_emitted = false; + + switch (opcode.alu_operation) { + case Macro::ALUOperation::Add: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + add(src_a, src_b); + } + } else { + add(src_a, src_b); + } + + if (!optimizer.can_skip_carry) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::AddWithCarry: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + adc(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; 
+ case Macro::ALUOperation::Subtract: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + sub(src_a, src_b); + has_emitted = true; + } + } else { + sub(src_a, src_b); + has_emitted = true; + } + if (!optimizer.can_skip_carry && has_emitted) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::SubtractWithBorrow: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + sbb(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Xor: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + xor_(src_a, src_b); + } + } else { + xor_(src_a, src_b); + } + break; + case Macro::ALUOperation::Or: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + or_(src_a, src_b); + } + } else { + or_(src_a, src_b); + } + break; + case Macro::ALUOperation::And: + if (optimizer.zero_reg_skip) { + if (!has_zero_register) { + and_(src_a, src_b); + } + } else { + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::AndNot: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + not_(src_b); + and_(src_a, src_b); + } + } else { + not_(src_b); + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::Nand: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + and_(src_a, src_b); + not_(src_a); + } + } else { + and_(src_a, src_b); + not_(src_a); + } + break; + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", + static_cast(opcode.alu_operation.Value())); + break; + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { + if (optimizer.skip_dummy_addimmediate) { + // Games tend to use this as an exit instruction placeholder. It's to encode an instruction + // without doing anything. In our case we can just not emit anything. 
+ if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { + return; + } + } + // Check for redundant moves + if (optimizer.optimize_for_method_move && + opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next_opcode.has_value()) { + const auto next = *next_opcode; + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + return; + } + } + } + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, RESULT); + auto src = Compile_GetRegister(opcode.src_b, eax); + + if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) { + shr(src, opcode.bf_src_bit); + } else if (opcode.bf_src_bit == 31) { + xor_(src, src); + } + // Don't bother masking the whole register since we're using a 32 bit register + if (opcode.bf_size != 31 && opcode.bf_size != 0) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + + const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + if (mask != 0xffffffff) { + and_(dst, mask); + } + or_(dst, src); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, eax); + auto src = Compile_GetRegister(opcode.src_b, RESULT); + + shr(src, al); + if (opcode.bf_size != 0 && opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + + if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, eax); + auto src = Compile_GetRegister(opcode.src_b, RESULT); + + if (opcode.bf_src_bit != 0) { + shr(src, opcode.bf_src_bit); + } + + if (opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } + shl(src, al); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) { + return maxwell3d->GetRegisterValue(method); +} + +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethodFromMME(method_address.address, value); +} + +void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + 
sub(result, opcode.immediate * -1); + } + } + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, RESULT); + Common::X64::CallFarFunction(*this, &Read); + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(RESULT, Common::X64::ABI_RETURN.cvt32()); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { + Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); + mov(Common::X64::ABI_PARAM3, value); + Common::X64::CallFarFunction(*this, &Send); + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + + Xbyak::Label dont_process{}; + // Get increment + test(METHOD_ADDRESS, 0x3f000); + // If zero, method address doesn't update + je(dont_process); + + mov(ecx, METHOD_ADDRESS); + and_(METHOD_ADDRESS, 0xfff); + shr(ecx, 12); + and_(ecx, 0x3f); + lea(eax, ptr[rcx + METHOD_ADDRESS_64]); + sal(ecx, 12); + or_(eax, ecx); + + mov(METHOD_ADDRESS, eax); + + L(dont_process); +} + +void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + const s32 jump_address = + static_cast(pc) + static_cast(opcode.GetBranchTarget() / sizeof(s32)); + + Xbyak::Label end; + auto value = Compile_GetRegister(opcode.src_a, eax); + test(value, value); + if (optimizer.has_delayed_pc) { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + jne(end, T_NEAR); + break; + case Macro::BranchCondition::NotZero: + je(end, T_NEAR); + break; + } + + if (opcode.branch_annul) { + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } else { + Xbyak::Label handle_post_exit{}; + Xbyak::Label skip{}; + jmp(skip, T_NEAR); + if (opcode.is_exit) { + L(handle_post_exit); + // Execute 1 instruction + mov(BRANCH_HOLDER, end_of_code); + // Jump to next instruction to skip delay slot check + jmp(labels[jump_address], T_NEAR); + } else { + L(handle_post_exit); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } + L(skip); + mov(BRANCH_HOLDER, handle_post_exit); + jmp(delay_skip[pc], T_NEAR); + } + } else { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + je(labels[jump_address], T_NEAR); + break; + case Macro::BranchCondition::NotZero: + jne(labels[jump_address], T_NEAR); + break; + } + } + + L(end); +} + +void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() { + optimizer.can_skip_carry = true; + optimizer.has_delayed_pc = false; + for (auto raw_op : code) { + Macro::Opcode op{}; + op.raw = raw_op; + + if (op.operation == Macro::Operation::ALU) { + // Scan for any ALU operations which actually use the carry flag, if they don't exist in + // our current code we can skip emitting the carry flag handling operations + if (op.alu_operation == Macro::ALUOperation::AddWithCarry || + op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { + optimizer.can_skip_carry = false; + } + } + + if (op.operation == Macro::Operation::Branch) { + if (!op.branch_annul) { + optimizer.has_delayed_pc = true; + } + } + } +} + +void MacroJITx64Impl::Compile() { + MICROPROFILE_SCOPE(MacroJitCompile); + bool keep_executing = true; + labels.fill(Xbyak::Label()); + + 
Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + // JIT state + mov(STATE, Common::X64::ABI_PARAM1); + mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + + static_cast(offsetof(JITState, parameters))]); + mov(REGISTERS, Common::X64::ABI_PARAM1); + add(REGISTERS, static_cast(offsetof(JITState, registers))); + xor_(RESULT, RESULT); + xor_(METHOD_ADDRESS, METHOD_ADDRESS); + xor_(NEXT_PARAMETER, NEXT_PARAMETER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + + mov(dword[REGISTERS + 4], Compile_FetchParameter()); + + // Track get register for zero registers and mark it as no-op + optimizer.zero_reg_skip = true; + + // AddImmediate tends to be used as a NOP instruction, if we detect this we can + // completely skip the entire code path and no emit anything + optimizer.skip_dummy_addimmediate = true; + + // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting + // one if our register isn't "dirty" + optimizer.optimize_for_method_move = true; + + // Check to see if we can skip emitting certain instructions + Optimizer_ScanFlags(); + + const u32 op_count = static_cast(code.size()); + for (u32 i = 0; i < op_count; i++) { + if (i < op_count - 1) { + pc = i + 1; + next_opcode = GetOpCode(); + } else { + next_opcode = {}; + } + pc = i; + Compile_NextInstruction(); + } + + L(end_of_code); + + Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + ret(); + ready(); + program = getCode(); +} + +bool MacroJITx64Impl::Compile_NextInstruction() { + const auto opcode = GetOpCode(); + if (labels[pc].getAddress()) { + return false; + } + + L(labels[pc]); + + const std::size_t op = static_cast(opcode.operation.Value()); + + if (InstructionTable[op] == nullptr) { + UNIMPLEMENTED_MSG("Unimplemented opcode {}", op); + } else { + ((*this).*InstructionTable[op])(opcode); + } + + if (optimizer.has_delayed_pc) { + if (opcode.is_exit) { + mov(rax, end_of_code); + test(BRANCH_HOLDER, BRANCH_HOLDER); + cmove(BRANCH_HOLDER, rax); + // Jump to next instruction to skip delay slot check + je(labels[pc + 1], T_NEAR); + } else { + // TODO(ogniK): Optimize delay slot branching + Xbyak::Label no_delay_slot{}; + test(BRANCH_HOLDER, BRANCH_HOLDER); + je(no_delay_slot, T_NEAR); + mov(rax, BRANCH_HOLDER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(rax); + L(no_delay_slot); + } + L(delay_skip[pc]); + if (opcode.is_exit) { + return false; + } + } else { + test(BRANCH_HOLDER, BRANCH_HOLDER); + jne(end_of_code, T_NEAR); + if (opcode.is_exit) { + inc(BRANCH_HOLDER); + return false; + } + } + return true; +} + +Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { + mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]); + inc(NEXT_PARAMETER); + return eax; +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[REGISTERS + index * sizeof(u32)]); + } + + return dst; +} + +Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[REGISTERS + index * sizeof(u32)]); + } + + return dst; +} + +void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { + Xbyak::Label zero{}, end{}; + xor_(ecx, ecx); + shr(dst, 32); + setne(cl); + mov(dword[STATE + offsetof(JITState, carry_flag)], ecx); +} + +void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation 
operation, u32 reg) { + auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { + // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero + // register. + if (reg == 0) { + return; + } + mov(dword[REGISTERS + reg * sizeof(u32)], result); + }; + auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; + + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + SetRegister(reg, Compile_FetchParameter()); + break; + case Macro::ResultOperation::Move: + SetRegister(reg, RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethod: + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, Compile_FetchParameter()); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, RESULT); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, Compile_FetchParameter()); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + Compile_Send(Compile_FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + shr(RESULT, 12); + and_(RESULT, 0b111111); + Compile_Send(RESULT); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast(operation)); + } +} + +Macro::Opcode MacroJITx64Impl::GetOpCode() const { + ASSERT(pc < code.size()); + return {code[pc]}; +} + +std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { + return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h new file mode 100644 index 0000000000..71cd6a3b01 --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.h @@ -0,0 +1,98 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include +#include +#include +#include "common/bit_field.h" +#include "common/common_types.h" +#include "common/x64/xbyak_abi.h" +#include "video_core/macro/macro.h" + +namespace Tegra { +namespace Engines { +class Maxwell3D; +} + +/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games +constexpr size_t MAX_CODE_SIZE = 0x10000; + +class MacroJITx64 final : public MacroEngine { +public: + explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + +protected: + std::unique_ptr Compile(const std::vector& code) override; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { +public: + MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector& code); + ~MacroJITx64Impl(); + void Execute(std::vector& parameters, u32 method) override; + + void Compile_ALU(Macro::Opcode opcode); + void Compile_AddImmediate(Macro::Opcode opcode); + void Compile_ExtractInsert(Macro::Opcode opcode); + void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); + void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); + void Compile_Read(Macro::Opcode opcode); + void Compile_Branch(Macro::Opcode opcode); + +private: + void Optimizer_ScanFlags(); + + void Compile(); + bool Compile_NextInstruction(); + + Xbyak::Reg32 Compile_FetchParameter(); + Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); + Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst); + void Compile_WriteCarry(Xbyak::Reg64 dst); + + void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); + void Compile_Send(Xbyak::Reg32 value); + + Macro::Opcode GetOpCode() const; + std::bitset<32> PersistentCallerSavedRegs() const; + + struct JITState { + Engines::Maxwell3D* maxwell3d{}; + std::array registers{}; + u32* parameters{}; + u32 carry_flag{}; + }; + static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); + using ProgramType = void (*)(JITState*); + + struct OptimizerState { + bool can_skip_carry{}; + bool has_delayed_pc{}; + bool zero_reg_skip{}; + bool skip_dummy_addimmediate{}; + bool optimize_for_method_move{}; + }; + OptimizerState optimizer{}; + + std::optional next_opcode{}; + ProgramType program{nullptr}; + + std::array labels; + std::array delay_skip{}; + Xbyak::Label end_of_code{}; + + bool is_delay_slot{}; + u32 pc{}; + std::optional delayed_pc; + + const std::vector& code; + Engines::Maxwell3D& maxwell3d; +}; + +} // namespace Tegra diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index b08b874265..7e9073cc36 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() { Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool(); Settings::values.disable_cpu_opt = ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool(); + Settings::values.disable_macro_jit = + ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool(); qt_config->endGroup(); } @@ -1011,6 +1013,7 @@ void Config::SaveDebuggingValues() { WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false); WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false); WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false); + WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false); qt_config->endGroup(); } diff --git a/src/yuzu/configuration/configure_debug.cpp 
b/src/yuzu/configuration/configure_debug.cpp index c2026763e6..2c77441fdf 100644 --- a/src/yuzu/configuration/configure_debug.cpp +++ b/src/yuzu/configuration/configure_debug.cpp @@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() { ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt); ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn()); ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); + ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn()); + ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit); } void ConfigureDebug::ApplyConfiguration() { @@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() { Settings::values.quest_flag = ui->quest_flag->isChecked(); Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked(); Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); + Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked(); Debugger::ToggleConsole(); Log::Filter filter; filter.ParseFilterString(Settings::values.log_filter); diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui index e0d4c4a449..46f0208c60 100644 --- a/src/yuzu/configuration/configure_debug.ui +++ b/src/yuzu/configuration/configure_debug.ui @@ -148,6 +148,19 @@ + + + + true + + + When checked, it disables the macro Just In Time compiler. Enabled this makes games run slower + + + Disable Macro JIT + + + diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index c20d48c429..7240270f50 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -432,6 +432,8 @@ void Config::ReadValues() { Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false); Settings::values.disable_cpu_opt = sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false); + Settings::values.disable_macro_jit = + sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false); const auto title_list = sdl2_config->Get("AddOns", "title_ids", ""); std::stringstream ss(title_list); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index abc6e6e656..6f53e96596 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -291,6 +291,8 @@ quest_flag = # Determines whether or not JIT CPU optimizations are enabled # false: Optimizations Enabled, true: Optimizations Disabled disable_cpu_opt = +# Enables/Disables the macro JIT compiler +disable_macro_jit=false [WebService] # Whether or not to enable telemetry From 8118ea160b194fbcc600c44bff3be556b249c780 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Sat, 30 May 2020 12:23:58 +1000 Subject: [PATCH 019/145] Favor switch case over jump table Easier to read and will emit a jump table automatically. 
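For reference, a minimal standalone sketch of the pattern this change adopts (illustrative names, not code from this patch): a dense switch over the opcode enum replaces the array of member-function pointers, and mainstream compilers still lower contiguous cases to a jump table on their own.

    // Hypothetical, self-contained example of dispatching on a small enum.
    enum class Operation : unsigned { ALU = 0, AddImmediate, Read, Branch };

    void Dispatch(Operation op) {
        switch (op) {
        case Operation::ALU:
            // ... compile an ALU instruction
            break;
        case Operation::AddImmediate:
            // ... compile an add-immediate instruction
            break;
        case Operation::Read:
            // ... compile a register read
            break;
        case Operation::Branch:
            // ... compile a branch
            break;
        default:
            // unimplemented opcode
            break;
        }
    }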
--- src/video_core/macro/macro_jit_x64.cpp | 43 +++++++++++++++----------- src/video_core/macro/macro_jit_x64.h | 1 + 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 1b657236a6..48501e582c 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -14,18 +14,6 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255 MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); namespace Tegra { -using JitFunction = void (MacroJITx64Impl::*)(Macro::Opcode opcode); -const std::array InstructionTable{ - &MacroJITx64Impl::Compile_ALU, - &MacroJITx64Impl::Compile_AddImmediate, - &MacroJITx64Impl::Compile_ExtractInsert, - &MacroJITx64Impl::Compile_ExtractShiftLeftImmediate, - &MacroJITx64Impl::Compile_ExtractShiftLeftRegister, - &MacroJITx64Impl::Compile_Read, - nullptr, - &MacroJITx64Impl::Compile_Branch, -}; - static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; static const Xbyak::Reg64 STATE = Xbyak::util::r11; @@ -489,12 +477,31 @@ bool MacroJITx64Impl::Compile_NextInstruction() { L(labels[pc]); - const std::size_t op = static_cast(opcode.operation.Value()); - - if (InstructionTable[op] == nullptr) { - UNIMPLEMENTED_MSG("Unimplemented opcode {}", op); - } else { - ((*this).*InstructionTable[op])(opcode); + switch (opcode.operation) { + case Macro::Operation::ALU: + Compile_ALU(opcode); + break; + case Macro::Operation::AddImmediate: + Compile_AddImmediate(opcode); + break; + case Macro::Operation::ExtractInsert: + Compile_ExtractInsert(opcode); + break; + case Macro::Operation::ExtractShiftLeftImmediate: + Compile_ExtractShiftLeftImmediate(opcode); + break; + case Macro::Operation::ExtractShiftLeftRegister: + Compile_ExtractShiftLeftRegister(opcode); + break; + case Macro::Operation::Read: + Compile_Read(opcode); + break; + case Macro::Operation::Branch: + Compile_Branch(opcode); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); + break; } if (optimizer.has_delayed_pc) { diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 71cd6a3b01..729ed77137 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -35,6 +35,7 @@ class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { public: MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector& code); ~MacroJITx64Impl(); + void Execute(std::vector& parameters, u32 method) override; void Compile_ALU(Macro::Opcode opcode); From 1ee1a5d3d64379bf2463c072a32af8e64a8c14cf Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 23:52:47 -0300 Subject: [PATCH 020/145] texture_cache: More relaxed reconstruction Only reupload textures when they've not been modified from the GPU. 
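For reference, a hedged sketch of the predicate-style check this change relies on (hypothetical types, not the actual cache code): when none of the overlapping surfaces has been written by the GPU, the new surface can simply be reloaded from guest memory instead of being reconstructed from the overlaps.

    #include <algorithm>
    #include <vector>

    struct Surface {
        bool gpu_modified = false;
        bool IsModified() const { return gpu_modified; }
    };

    // True if any overlapping surface carries GPU-side modifications.
    bool AnyModified(const std::vector<Surface*>& overlaps) {
        return std::any_of(overlaps.begin(), overlaps.end(),
                           [](const Surface* s) { return s->IsModified(); });
    }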
--- src/video_core/texture_cache/texture_cache.h | 22 ++++++++------------ 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 3e024a0981..4ba0d2c3a3 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -656,19 +656,19 @@ private: if (params.target == SurfaceTarget::Texture3D) { return std::nullopt; } + const auto test_modified = [](TSurface& surface) { return surface->IsModified(); }; TSurface new_surface = GetUncachedSurface(gpu_addr, params); - std::size_t passed_tests = 0; - bool modified = false; - u32 num_resources = 0; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - num_resources += src_params.depth * src_params.num_levels; - } - if (num_resources != params.depth * params.num_levels) { + if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) { LoadSurface(new_surface); + for (const auto& surface : overlaps) { + Unregister(surface); + } + Register(new_surface); + return {{new_surface, new_surface->GetMainView()}}; } + std::size_t passed_tests = 0; for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; @@ -681,11 +681,6 @@ private: } ++passed_tests; - if (!surface->IsModified()) { - continue; - } - modified = true; - // Copy all mipmaps and layers const u32 block_width = params.GetDefaultBlockWidth(); const u32 block_height = params.GetDefaultBlockHeight(); @@ -709,6 +704,7 @@ private: return std::nullopt; } + const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified); for (const auto& surface : overlaps) { Unregister(surface); } From f2d1aa97ad52f5f78ae30b70c59559bfe0b950c5 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 20:21:48 -0300 Subject: [PATCH 021/145] shader/other: Fix hardcoded value in S2R INVOCATION_INFO Geometry shaders built from Nvidia's compiler check for bits[16:23] to be less than or equal to 0 with VSETP to default to a "safe" value of 0x8000'0000 (safe from hardware's perspective). To avoid hitting this path in the shader, return 0x00ff'0000 from S2R INVOCATION_INFO. This seems to be the maximum number of vertices a geometry shader can emit in a primitive. --- src/video_core/shader/decode/other.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 694b325e14..045f5abb50 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -83,7 +83,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); - return Immediate(0U); + return Immediate(0x00ff'0000U); case SystemVariable::WscaleFactorXY: UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented"); return Immediate(0U); From bb8ef381526cd9cc0a52f00383c755913a20aa43 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sun, 31 May 2020 03:12:21 -0400 Subject: [PATCH 022/145] gl_device: Enable compute shaders for Intel proprietary drivers Previously we were disabling compute shaders on Intel's proprietary driver due to broken compute. This has been fixed in the latest Intel drivers. 
Re-enable compute for Intel proprietary drivers and remove the check for broken compute. --- src/video_core/renderer_opengl/gl_device.cpp | 4 ---- src/video_core/renderer_opengl/gl_device.h | 5 ----- src/video_core/renderer_opengl/gl_rasterizer.cpp | 4 ---- 3 files changed, 13 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 466a911db3..e1b2452880 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -166,8 +166,6 @@ Device::Device() : base_bindings{BuildBaseBindings()} { const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; - const bool is_intel = vendor == "Intel"; - const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr; uniform_buffer_alignment = GetInteger(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); @@ -182,7 +180,6 @@ Device::Device() : base_bindings{BuildBaseBindings()} { has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_broken_compute = is_intel_proprietary; has_fast_buffer_sub_data = is_nvidia; use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5; @@ -206,7 +203,6 @@ Device::Device(std::nullptr_t) { has_image_load_formatted = true; has_variable_aoffi = true; has_component_indexing_bug = false; - has_broken_compute = false; has_precise_bug = false; } diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index e915dbd866..683ed90029 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -80,10 +80,6 @@ public: return has_precise_bug; } - bool HasBrokenCompute() const { - return has_broken_compute; - } - bool HasFastBufferSubData() const { return has_fast_buffer_sub_data; } @@ -109,7 +105,6 @@ private: bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; - bool has_broken_compute{}; bool has_fast_buffer_sub_data{}; bool use_assembly_shaders{}; }; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 716d43e65d..61cf99b9de 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -655,10 +655,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - if (device.HasBrokenCompute()) { - return; - } - buffer_cache.Acquire(); current_cbuf = 0; From 0ac8848eae50d79ea43adc8a15be0db4fb4b1cf9 Mon Sep 17 00:00:00 2001 From: bunnei Date: Sun, 31 May 2020 17:46:25 -0400 Subject: [PATCH 023/145] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 61321bf0a0..ac6b21f742 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.11) +cmake_minimum_required(VERSION 3.15) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules") From 104b334e40d5c5d00963a3442255cae9459e4b73 Mon Sep 17 00:00:00 2001 From: bunnei Date: Sun, 31 May 2020 18:35:36 -0400 Subject: [PATCH 024/145] Update CMakeLists.txt --- CMakeLists.txt | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ac6b21f742..a9f669d568 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ project(yuzu) option(ENABLE_SDL2 "Enable the SDL2 frontend" ON) option(ENABLE_QT "Enable the Qt frontend" ON) -CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF) +CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF) option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON) From ee21e4ecd372d6a191244dffbc5ac5c7b80150c1 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 28 May 2020 17:06:22 -0300 Subject: [PATCH 025/145] glsl: Squash constant buffers into a single SSBO when we hit the limit Avoids compilation errors at the cost of shader build times and runtime performance when a game hits the limit of uniform buffers we can use. --- src/video_core/renderer_opengl/gl_device.cpp | 46 +++++---- src/video_core/renderer_opengl/gl_device.h | 7 +- .../renderer_opengl/gl_rasterizer.cpp | 78 +++++++++++---- .../renderer_opengl/gl_rasterizer.h | 4 +- .../renderer_opengl/gl_shader_cache.cpp | 12 ++- .../renderer_opengl/gl_shader_decompiler.cpp | 99 ++++++++++++------- .../renderer_opengl/gl_shader_decompiler.h | 6 +- 7 files changed, 173 insertions(+), 79 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index e1b2452880..d20547c044 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1; constexpr u32 NumStages = 5; -constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, - GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, - GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS}; +constexpr std::array LimitUBOs = { + GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, + GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, + GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; constexpr std::array LimitSSBOs = { - GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, + GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, - GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS}; + GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; -constexpr std::array LimitSamplers = { - GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, - GL_MAX_TEXTURE_IMAGE_UNITS}; +constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, + GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, + GL_MAX_TEXTURE_IMAGE_UNITS, + GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; -constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS, - GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, - GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, - GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS}; +constexpr std::array LimitImages = { + GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, + GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, + 
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; template T GetInteger(GLenum pname) { @@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional limit = {}) { return std::exchange(base, base + amount); } +std::array BuildMaxUniformBuffers() noexcept { + std::array max; + std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), + [](GLenum pname) { return GetInteger(pname); }); + return max; +} + std::array BuildBaseBindings() noexcept { std::array bindings; @@ -159,7 +170,8 @@ bool IsASTCSupported() { } // Anonymous namespace -Device::Device() : base_bindings{BuildBaseBindings()} { +Device::Device() + : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast(glGetString(GL_VENDOR)); const auto renderer = reinterpret_cast(glGetString(GL_RENDERER)); const std::vector extensions = GetExtensions(); @@ -194,7 +206,9 @@ Device::Device() : base_bindings{BuildBaseBindings()} { } Device::Device(std::nullptr_t) { - uniform_buffer_alignment = 0; + max_uniform_buffers.fill(std::numeric_limits::max()); + uniform_buffer_alignment = 4; + shader_storage_alignment = 4; max_vertex_attributes = 16; max_varyings = 15; has_warp_intrinsics = true; @@ -202,8 +216,6 @@ Device::Device(std::nullptr_t) { has_vertex_viewport_layer = true; has_image_load_formatted = true; has_variable_aoffi = true; - has_component_indexing_bug = false; - has_precise_bug = false; } bool Device::TestVariableAoffi() { diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 683ed90029..98cca02543 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -24,6 +24,10 @@ public: explicit Device(); explicit Device(std::nullptr_t); + u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { + return max_uniform_buffers[static_cast(shader_type)]; + } + const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { return base_bindings[stage_index]; } @@ -92,7 +96,8 @@ private: static bool TestVariableAoffi(); static bool TestPreciseBug(); - std::array base_bindings; + std::array max_uniform_buffers{}; + std::array base_bindings{}; std::size_t uniform_buffer_alignment{}; std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 61cf99b9de..b0f303a3cc 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { +constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; +constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = + NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; +constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = + NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; + constexpr std::size_t NumSupportedVertexAttributes = 16; template @@ -104,6 +110,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { CheckExtensions(); + unified_uniform_buffer.Create(); + glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); + if (device.UseAssemblyShaders()) { glCreateBuffers(static_cast(staging_cbufs.size()), staging_cbufs.data()); 
for (const GLuint cbuf : staging_cbufs) { @@ -842,34 +851,56 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad MICROPROFILE_SCOPE(OpenGL_UBO); const auto& stages = system.GPU().Maxwell3D().state.shader_stages; const auto& shader_stage = stages[stage_index]; + const auto& entries = shader->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; + const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; - u32 binding = - device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer; - for (const auto& entry : shader->GetEntries().const_buffers) { - const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; - SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry); + const auto base_bindings = device.GetBaseBindings(stage_index); + u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer; + for (const auto& entry : entries.const_buffers) { + const u32 index = entry.GetIndex(); + const auto& buffer = shader_stage.const_buffers[index]; + SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, + base_unified_offset + index * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const u32 index = static_cast(base_bindings.shader_storage_buffer + + entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, + base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& entries = kernel->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; u32 binding = 0; - for (const auto& entry : kernel->GetEntries().const_buffers) { + for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); Tegra::Engines::ConstBufferInfo buffer; buffer.address = config.Address(); buffer.size = config.size; buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry); + SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, + use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const GLuint index = static_cast(entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, + NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset) { if (!buffer.enabled) { // Set values to zero to unbind buffers if (device.UseAssemblyShaders()) { @@ -885,20 +916,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, // UBO alignment requirements. 
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const auto alignment = device.GetUniformBufferAlignment(); - auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, - device.HasFastBufferSubData()); - if (!device.UseAssemblyShaders()) { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); + const bool fast_upload = !use_unified && device.HasFastBufferSubData(); + + const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); + const GPUVAddr gpu_addr = buffer.address; + auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + + if (device.UseAssemblyShaders()) { + UNIMPLEMENTED_IF(use_unified); + if (offset != 0) { + const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; + glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); + cbuf = staging_cbuf; + offset = 0; + } + glBindBufferRangeNV(stage, binding, cbuf, offset, size); return; } - if (offset != 0) { - const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; - glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); - cbuf = staging_cbuf; - offset = 0; + + if (use_unified) { + glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); } - glBindBufferRangeNV(stage, binding, cbuf, offset, size); } void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 87f7fe159f..f5dc56a0e2 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -107,7 +107,8 @@ private: /// Configures a constant buffer. void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry); + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); @@ -253,6 +254,7 @@ private: Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; std::array staging_cbufs{}; std::size_t current_cbuf = 0; + OGLBuffer unified_uniform_buffer; /// Number of commands queued to the OpenGL driver. Reseted on flush. 
std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 4cd0f36cff..a991ca64a7 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -241,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + return std::shared_ptr( + new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), + MakeEntries(params.device, ir, shader_type), std::move(program))); } Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { @@ -265,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + return std::shared_ptr( + new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); } Shader CachedShader::CreateFromCache(const ShaderParameters& params, @@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, PrecompiledShader shader; shader.program = std::move(program); shader.registry = std::move(registry); - shader.entries = MakeEntries(ir); + shader.entries = MakeEntries(device, ir, entry.type); std::scoped_lock lock{mutex}; if (callback) { diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 9cb1159599..502b95973a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -61,8 +61,8 @@ struct TextureDerivates {}; using TextureArgument = std::pair; using TextureIR = std::variant; -constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); +constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast(Maxwell::MaxConstBufferSize) / sizeof(u32); +constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint @@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { + const u32 num_ubos = static_cast(ir.GetConstantBuffers().size()); + // We waste one UBO for emulation + const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; + return num_ubos > num_available_ubos; +} + struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -412,8 +419,9 @@ class GLSLDecompiler final { public: explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, ShaderType stage, std::string_view identifier, std::string_view suffix) - : device{device}, ir{ir}, registry{registry}, stage{stage}, - identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + : device{device}, ir{ir}, registry{registry}, 
stage{stage}, identifier{identifier}, + suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ + UseUnifiedUniforms(device, ir, stage)} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -834,12 +842,24 @@ private: } void DeclareConstantBuffers() { + if (use_unified_uniforms) { + const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + + static_cast(ir.GetGlobalMemory().size()); + code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", + binding); + code.AddLine(" uint cbufs[];"); + code.AddLine("}};"); + code.AddNewLine(); + return; + } + u32 binding = device.GetBaseBindings(stage).uniform_buffer; - for (const auto& buffers : ir.GetConstantBuffers()) { - const auto index = buffers.first; + for (const auto [index, info] : ir.GetConstantBuffers()) { + const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; + const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, GetConstBufferBlock(index)); - code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); + code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size); code.AddLine("}};"); code.AddNewLine(); } @@ -1038,42 +1058,51 @@ private: if (const auto cbuf = std::get_if(&*node)) { const Node offset = cbuf->GetOffset(); + const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; + if (const auto immediate = std::get_if(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), + if (use_unified_uniforms) { + return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), + Type::Uint}; + } else { + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), + Type::Uint}; + } + } + + // Indirect access + if (use_unified_uniforms) { + return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, + Visit(offset).AsUint()), Type::Uint}; } - if (std::holds_alternative(*offset)) { - // Indirect access - const std::string final_offset = code.GenerateTemporary(); - code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); + const std::string final_offset = code.GenerateTemporary(); + code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); - if (!device.HasComponentIndexingBug()) { - return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), - final_offset, final_offset), - Type::Uint}; - } - - // AMD's proprietary GLSL compiler emits ill code for variable component access. - // To bypass this driver bug generate 4 ifs, one per each component. 
- const std::string pack = code.GenerateTemporary(); - code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), - final_offset); - - const std::string result = code.GenerateTemporary(); - code.AddLine("uint {};", result); - for (u32 swizzle = 0; swizzle < 4; ++swizzle) { - code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, - pack, GetSwizzle(swizzle)); - } - return {result, Type::Uint}; + if (!device.HasComponentIndexingBug()) { + return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), + final_offset, final_offset), + Type::Uint}; } - UNREACHABLE_MSG("Unmanaged offset node type"); + // AMD's proprietary GLSL compiler emits ill code for variable component access. + // To bypass this driver bug generate 4 ifs, one per each component. + const std::string pack = code.GenerateTemporary(); + code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), + final_offset); + + const std::string result = code.GenerateTemporary(); + code.AddLine("uint {};", result); + for (u32 swizzle = 0; swizzle < 4; ++swizzle) { + code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack, + GetSwizzle(swizzle)); + } + return {result, Type::Uint}; } if (const auto gmem = std::get_if(&*node)) { @@ -2710,6 +2739,7 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; + const bool use_unified_uniforms; std::unordered_map transform_feedback; ShaderWriter code; @@ -2905,7 +2935,7 @@ void GLSLDecompiler::DecompileAST() { } // Anonymous namespace -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { +ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) { ShaderEntries entries; for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), @@ -2926,6 +2956,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } entries.shader_length = ir.GetLength(); + entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index e8a1787642..451c9689af 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -53,11 +53,13 @@ struct ShaderEntries { std::vector global_memory_entries; std::vector samplers; std::vector images; - u32 clip_distances{}; std::size_t shader_length{}; + u32 clip_distances{}; + bool use_unified_uniforms{}; }; -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); +ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Tegra::Engines::ShaderType stage); std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, const VideoCommon::Shader::Registry& registry, From 0ee310ebdc3f9abf78aaa88bdbf1eeaa3a8f0ebd Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 04:21:57 -0300 Subject: [PATCH 026/145] gl_device: Avoid devices with CAVEAT_SUPPORT on ASTC This avoids using Nvidia's ASTC decoder on OpenGL. The last time it was profiled, it was slower than yuzu's decoder. While we are at it, fix a bug in the texture cache when native ASTC is not supported. 
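For context, the "CAVEAT_SUPPORT" in the title refers to the ARB_internalformat_query2 support levels: a format can be reported as supported overall while the per-stage capability queries only return GL_CAVEAT_SUPPORT (typically an emulated, slow path) instead of GL_FULL_SUPPORT. The sketch below is illustrative only, assuming a context with ARB_internalformat_query2 available; it is not the code added by this patch, which loops over every relevant target, ASTC format and shader stage.

    // Minimal sketch (not the patch code): distinguish "exposed at all" from
    // "exposed without caveats" for one format on one target and one stage.
    bool HasFullAstcSupport(GLenum format) {
        GLint supported = GL_FALSE;
        glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1, &supported);
        if (supported != GL_TRUE) {
            return false; // the driver does not expose the format at all
        }
        GLint fragment_support = GL_NONE;
        glGetInternalformativ(GL_TEXTURE_2D, format, GL_FRAGMENT_TEXTURE, 1, &fragment_support);
        // GL_CAVEAT_SUPPORT here would mean an emulated path, so prefer yuzu's own decoder.
        return fragment_support == GL_FULL_SUPPORT;
    }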
--- src/video_core/renderer_opengl/gl_device.cpp | 24 ++++++++++++++----- .../renderer_opengl/gl_texture_cache.cpp | 3 +-- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 466a911db3..ff35d362dc 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -133,6 +133,7 @@ std::array BuildBaseBindin } bool IsASTCSupported() { + static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY}; static constexpr std::array formats = { GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR, @@ -149,12 +150,23 @@ bool IsASTCSupported() { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, }; - return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) { - GLint supported; - glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1, - &supported); - return supported == GL_TRUE; - }) == formats.end(); + static constexpr std::array required_support = { + GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE, + GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE, + }; + + for (const GLenum target : targets) { + for (const GLenum format : formats) { + for (const GLenum support : required_support) { + GLint value; + glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value); + if (value != GL_FULL_SUPPORT) { + return false; + } + } + } + } + return true; } } // Anonymous namespace diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 4faa8b90c5..57db5a08b8 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -404,8 +404,7 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy) - : VideoCommon::ViewBase(params), surface{surface}, - format{GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format}, + : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format}, target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { if (!is_proxy) { main_view = CreateTextureView(); From 4d10d3113fbb22f225fb0353f3f28c39902af11d Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Mon, 1 Jun 2020 19:38:44 +0200 Subject: [PATCH 027/145] hid: Stub GetXpadIDs Allows Minecraft: Nintendo Switch Edition (a.k.a. 
old Minecraft) to boot and go ingame --- src/core/hle/service/hid/hid.cpp | 14 +++++++++++++- src/core/hle/service/hid/hid.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index c84cb14831..18fb1d62c4 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -161,7 +161,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) { {40, nullptr, "AcquireXpadIdEventHandle"}, {41, nullptr, "ReleaseXpadIdEventHandle"}, {51, &Hid::ActivateXpad, "ActivateXpad"}, - {55, nullptr, "GetXpadIds"}, + {55, &Hid::GetXpadIDs, "GetXpadIds"}, {56, nullptr, "ActivateJoyXpad"}, {58, nullptr, "GetJoyXpadLifoHandle"}, {59, nullptr, "GetJoyXpadIds"}, @@ -319,6 +319,18 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) { rb.Push(RESULT_SUCCESS); } +void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto applet_resource_user_id{rp.Pop()}; + + LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", + applet_resource_user_id); + + IPC::ResponseBuilder rb{ctx, 3}; + rb.Push(RESULT_SUCCESS); + rb.Push(0); +} + void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; const auto applet_resource_user_id{rp.Pop()}; diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h index c8ed4ad8b5..d481a75f85 100644 --- a/src/core/hle/service/hid/hid.h +++ b/src/core/hle/service/hid/hid.h @@ -86,6 +86,7 @@ public: private: void CreateAppletResource(Kernel::HLERequestContext& ctx); void ActivateXpad(Kernel::HLERequestContext& ctx); + void GetXpadIDs(Kernel::HLERequestContext& ctx); void ActivateDebugPad(Kernel::HLERequestContext& ctx); void ActivateTouchScreen(Kernel::HLERequestContext& ctx); void ActivateMouse(Kernel::HLERequestContext& ctx); From 8c84a7e7ec25db2165e36bf9cc1365618752d2a1 Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Mon, 1 Jun 2020 19:42:54 +0200 Subject: [PATCH 028/145] Clang-format --- src/core/hle/service/hid/hid.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index 18fb1d62c4..72a050de25 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -323,8 +323,7 @@ void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; const auto applet_resource_user_id{rp.Pop()}; - LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", - applet_resource_user_id); + LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id); IPC::ResponseBuilder rb{ctx, 3}; rb.Push(RESULT_SUCCESS); From 70188d69b0deca34d6c98baa4a0b680692d79066 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 1 Jun 2020 13:34:18 -0400 Subject: [PATCH 029/145] gl_shader_decompiler: Fix geometry shader outputs for Intel drivers On Intel's proprietary drivers, gl_Layer and gl_ViewportIndex are not allowed members of gl_PerVertex block, causing the shader to fail to compile. Fix this by declaring these variables outside of gl_PerVertex. 
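To illustrate the shape of the change, the two forms of the emitted interface are sketched below as GLSL held in C++ raw string literals (assuming <string_view>). These are hedged examples rather than the decompiler's actual output; the real declarations depend on the shader stage and on which built-ins the program uses.

    // Form rejected by Intel's proprietary driver: built-ins inside the block.
    constexpr std::string_view rejected_by_intel = R"(
    out gl_PerVertex {
        vec4 gl_Position;
        int gl_Layer;
        int gl_ViewportIndex;
    };)";

    // Form this change emits instead: the built-ins are declared outside the block.
    constexpr std::string_view accepted = R"(
    out gl_PerVertex {
        vec4 gl_Position;
    };
    out int gl_Layer;
    out int gl_ViewportIndex;)";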
--- .../renderer_opengl/gl_shader_decompiler.cpp | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 9cb1159599..f5a592490c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -618,19 +618,6 @@ private: break; } } - if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { - if (ir.UsesLayer()) { - code.AddLine("int gl_Layer;"); - } - if (ir.UsesViewportIndex()) { - code.AddLine("int gl_ViewportIndex;"); - } - } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderType::Vertex && - !device.HasVertexViewportLayer()) { - LOG_ERROR( - Render_OpenGL, - "GL_ARB_shader_viewport_layer_array is not available and its required by a shader"); - } if (ir.UsesPointSize()) { code.AddLine("float gl_PointSize;"); @@ -647,6 +634,21 @@ private: --code.scope; code.AddLine("}};"); code.AddNewLine(); + + if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { + if (ir.UsesLayer()) { + code.AddLine("out int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("out int gl_ViewportIndex;"); + } + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderType::Vertex && + !device.HasVertexViewportLayer()) { + LOG_ERROR( + Render_OpenGL, + "GL_ARB_shader_viewport_layer_array is not available and its required by a shader"); + } + code.AddNewLine(); } void DeclareRegisters() { From 74f2e5f1a460e8f429cc41032bd680548d3b1f46 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 1 Jun 2020 15:35:44 -0400 Subject: [PATCH 030/145] gl_shader_decompiler: Declare gl_Layer and gl_ViewportIndex within gl_PerVertex for vertex and tessellation shaders --- .../renderer_opengl/gl_shader_decompiler.cpp | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index f5a592490c..2c818f406a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -619,6 +619,21 @@ private: } } + if (stage != ShaderType::Geometry && + (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) { + if (ir.UsesLayer()) { + code.AddLine("int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("int gl_ViewportIndex;"); + } + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderType::Vertex && + !device.HasVertexViewportLayer()) { + LOG_ERROR( + Render_OpenGL, + "GL_ARB_shader_viewport_layer_array is not available and its required by a shader"); + } + if (ir.UsesPointSize()) { code.AddLine("float gl_PointSize;"); } @@ -635,18 +650,13 @@ private: code.AddLine("}};"); code.AddNewLine(); - if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { + if (stage == ShaderType::Geometry) { if (ir.UsesLayer()) { code.AddLine("out int gl_Layer;"); } if (ir.UsesViewportIndex()) { code.AddLine("out int gl_ViewportIndex;"); } - } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderType::Vertex && - !device.HasVertexViewportLayer()) { - LOG_ERROR( - Render_OpenGL, - "GL_ARB_shader_viewport_layer_array is not available and its required by a shader"); } code.AddNewLine(); } From 3a59e724c9f482531ece27f992139b6c5800422e Mon Sep 17 00:00:00 2001 From: 
ReinUsesLisp Date: Tue, 2 Jun 2020 02:11:32 -0300 Subject: [PATCH 031/145] maxwell_to_vk: Add R16UI image format - Used by Octopath Traveler --- .../renderer_vulkan/maxwell_to_vk.cpp | 2 +- src/video_core/renderer_vulkan/vk_device.cpp | 143 +++++++++--------- 2 files changed, 74 insertions(+), 71 deletions(-) diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 2871035f50..62e950d310 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -149,7 +149,7 @@ struct FormatTuple { {VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16F {VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16U {VK_FORMAT_UNDEFINED}, // R16S - {VK_FORMAT_UNDEFINED}, // R16UI + {VK_FORMAT_R16_UINT, Attachable | Storage}, // R16UI {VK_FORMAT_UNDEFINED}, // R16I {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // RG16 {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // RG16F diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 750e5a0caa..9fd8ac3f6e 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -73,76 +73,79 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType std::unordered_map GetFormatProperties( vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { - static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32, - VK_FORMAT_A8B8G8R8_UINT_PACK32, - VK_FORMAT_A8B8G8R8_SNORM_PACK32, - VK_FORMAT_A8B8G8R8_SRGB_PACK32, - VK_FORMAT_B5G6R5_UNORM_PACK16, - VK_FORMAT_A2B10G10R10_UNORM_PACK32, - VK_FORMAT_A1R5G5B5_UNORM_PACK16, - VK_FORMAT_R32G32B32A32_SFLOAT, - VK_FORMAT_R32G32B32A32_UINT, - VK_FORMAT_R32G32_SFLOAT, - VK_FORMAT_R32G32_UINT, - VK_FORMAT_R16G16B16A16_UINT, - VK_FORMAT_R16G16B16A16_SNORM, - VK_FORMAT_R16G16B16A16_UNORM, - VK_FORMAT_R16G16_UNORM, - VK_FORMAT_R16G16_SNORM, - VK_FORMAT_R16G16_SFLOAT, - VK_FORMAT_R16_UNORM, - VK_FORMAT_R8G8B8A8_SRGB, - VK_FORMAT_R8G8_UNORM, - VK_FORMAT_R8G8_SNORM, - VK_FORMAT_R8G8_UINT, - VK_FORMAT_R8_UNORM, - VK_FORMAT_R8_UINT, - VK_FORMAT_B10G11R11_UFLOAT_PACK32, - VK_FORMAT_R32_SFLOAT, - VK_FORMAT_R32_UINT, - VK_FORMAT_R32_SINT, - VK_FORMAT_R16_SFLOAT, - VK_FORMAT_R16G16B16A16_SFLOAT, - VK_FORMAT_B8G8R8A8_UNORM, - VK_FORMAT_B8G8R8A8_SRGB, - VK_FORMAT_R4G4B4A4_UNORM_PACK16, - VK_FORMAT_D32_SFLOAT, - VK_FORMAT_D16_UNORM, - VK_FORMAT_D16_UNORM_S8_UINT, - VK_FORMAT_D24_UNORM_S8_UINT, - VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_BC1_RGBA_UNORM_BLOCK, - VK_FORMAT_BC2_UNORM_BLOCK, - VK_FORMAT_BC3_UNORM_BLOCK, - VK_FORMAT_BC4_UNORM_BLOCK, - VK_FORMAT_BC5_UNORM_BLOCK, - VK_FORMAT_BC5_SNORM_BLOCK, - VK_FORMAT_BC7_UNORM_BLOCK, - VK_FORMAT_BC6H_UFLOAT_BLOCK, - VK_FORMAT_BC6H_SFLOAT_BLOCK, - VK_FORMAT_BC1_RGBA_SRGB_BLOCK, - VK_FORMAT_BC2_SRGB_BLOCK, - VK_FORMAT_BC3_SRGB_BLOCK, - VK_FORMAT_BC7_SRGB_BLOCK, - VK_FORMAT_ASTC_4x4_SRGB_BLOCK, - VK_FORMAT_ASTC_8x8_SRGB_BLOCK, - VK_FORMAT_ASTC_8x5_SRGB_BLOCK, - VK_FORMAT_ASTC_5x4_SRGB_BLOCK, - VK_FORMAT_ASTC_5x5_UNORM_BLOCK, - VK_FORMAT_ASTC_5x5_SRGB_BLOCK, - VK_FORMAT_ASTC_10x8_UNORM_BLOCK, - VK_FORMAT_ASTC_10x8_SRGB_BLOCK, - VK_FORMAT_ASTC_6x6_UNORM_BLOCK, - VK_FORMAT_ASTC_6x6_SRGB_BLOCK, - VK_FORMAT_ASTC_10x10_UNORM_BLOCK, - VK_FORMAT_ASTC_10x10_SRGB_BLOCK, - VK_FORMAT_ASTC_12x12_UNORM_BLOCK, - VK_FORMAT_ASTC_12x12_SRGB_BLOCK, - VK_FORMAT_ASTC_8x6_UNORM_BLOCK, - VK_FORMAT_ASTC_8x6_SRGB_BLOCK, - VK_FORMAT_ASTC_6x5_UNORM_BLOCK, - 
VK_FORMAT_ASTC_6x5_SRGB_BLOCK, - VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}; + static constexpr std::array formats{ + VK_FORMAT_A8B8G8R8_UNORM_PACK32, + VK_FORMAT_A8B8G8R8_UINT_PACK32, + VK_FORMAT_A8B8G8R8_SNORM_PACK32, + VK_FORMAT_A8B8G8R8_SRGB_PACK32, + VK_FORMAT_B5G6R5_UNORM_PACK16, + VK_FORMAT_A2B10G10R10_UNORM_PACK32, + VK_FORMAT_A1R5G5B5_UNORM_PACK16, + VK_FORMAT_R32G32B32A32_SFLOAT, + VK_FORMAT_R32G32B32A32_UINT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R32G32_UINT, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16_UNORM, + VK_FORMAT_R16G16_SNORM, + VK_FORMAT_R16G16_SFLOAT, + VK_FORMAT_R16_UNORM, + VK_FORMAT_R16_UINT, + VK_FORMAT_R8G8B8A8_SRGB, + VK_FORMAT_R8G8_UNORM, + VK_FORMAT_R8G8_SNORM, + VK_FORMAT_R8G8_UINT, + VK_FORMAT_R8_UNORM, + VK_FORMAT_R8_UINT, + VK_FORMAT_B10G11R11_UFLOAT_PACK32, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_R32_UINT, + VK_FORMAT_R32_SINT, + VK_FORMAT_R16_SFLOAT, + VK_FORMAT_R16G16B16A16_SFLOAT, + VK_FORMAT_B8G8R8A8_UNORM, + VK_FORMAT_B8G8R8A8_SRGB, + VK_FORMAT_R4G4B4A4_UNORM_PACK16, + VK_FORMAT_D32_SFLOAT, + VK_FORMAT_D16_UNORM, + VK_FORMAT_D16_UNORM_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT, + VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_BC1_RGBA_UNORM_BLOCK, + VK_FORMAT_BC2_UNORM_BLOCK, + VK_FORMAT_BC3_UNORM_BLOCK, + VK_FORMAT_BC4_UNORM_BLOCK, + VK_FORMAT_BC5_UNORM_BLOCK, + VK_FORMAT_BC5_SNORM_BLOCK, + VK_FORMAT_BC7_UNORM_BLOCK, + VK_FORMAT_BC6H_UFLOAT_BLOCK, + VK_FORMAT_BC6H_SFLOAT_BLOCK, + VK_FORMAT_BC1_RGBA_SRGB_BLOCK, + VK_FORMAT_BC2_SRGB_BLOCK, + VK_FORMAT_BC3_SRGB_BLOCK, + VK_FORMAT_BC7_SRGB_BLOCK, + VK_FORMAT_ASTC_4x4_SRGB_BLOCK, + VK_FORMAT_ASTC_8x8_SRGB_BLOCK, + VK_FORMAT_ASTC_8x5_SRGB_BLOCK, + VK_FORMAT_ASTC_5x4_SRGB_BLOCK, + VK_FORMAT_ASTC_5x5_UNORM_BLOCK, + VK_FORMAT_ASTC_5x5_SRGB_BLOCK, + VK_FORMAT_ASTC_10x8_UNORM_BLOCK, + VK_FORMAT_ASTC_10x8_SRGB_BLOCK, + VK_FORMAT_ASTC_6x6_UNORM_BLOCK, + VK_FORMAT_ASTC_6x6_SRGB_BLOCK, + VK_FORMAT_ASTC_10x10_UNORM_BLOCK, + VK_FORMAT_ASTC_10x10_SRGB_BLOCK, + VK_FORMAT_ASTC_12x12_UNORM_BLOCK, + VK_FORMAT_ASTC_12x12_SRGB_BLOCK, + VK_FORMAT_ASTC_8x6_UNORM_BLOCK, + VK_FORMAT_ASTC_8x6_SRGB_BLOCK, + VK_FORMAT_ASTC_6x5_UNORM_BLOCK, + VK_FORMAT_ASTC_6x5_SRGB_BLOCK, + VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, + }; std::unordered_map format_properties; for (const auto format : formats) { format_properties.emplace(format, physical.GetFormatProperties(format)); From 4a6b9a1a71cc7fda9dca4d60a26635485cf0c64f Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 1 Jun 2020 21:41:07 -0300 Subject: [PATCH 032/145] vk_rasterizer: Implement storage texels This is the equivalent of an image buffer on OpenGL. 
- Used by Octopath Traveler --- .../renderer_vulkan/vk_compute_pipeline.cpp | 3 +- .../renderer_vulkan/vk_descriptor_pool.cpp | 1 + .../renderer_vulkan/vk_pipeline_cache.cpp | 16 ++-- .../renderer_vulkan/vk_rasterizer.cpp | 48 +++++++++--- .../renderer_vulkan/vk_rasterizer.h | 16 +++- .../renderer_vulkan/vk_shader_decompiler.cpp | 77 ++++++++++++------- .../renderer_vulkan/vk_shader_decompiler.h | 9 ++- .../renderer_vulkan/vk_texture_cache.cpp | 4 +- 8 files changed, 121 insertions(+), 53 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 8e1b462777..281bf9ac3b 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const { }; add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size()); - add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size()); VkDescriptorSetLayoutCreateInfo ci; diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index 890fd52cf8..9259b618d5 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() { {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60}, {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64}, {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64}, + {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64}, {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}}; VkDescriptorPoolCreateInfo ci; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 65a1c62454..b8ccf164f7 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -45,6 +45,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; +constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ @@ -104,8 +105,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries, u32 binding = base_binding; AddBindings(bindings, binding, flags, entries.const_buffers); AddBindings(bindings, binding, flags, entries.global_buffers); - AddBindings(bindings, binding, flags, entries.texel_buffers); + AddBindings(bindings, binding, flags, entries.uniform_texels); AddBindings(bindings, binding, flags, entries.samplers); + AddBindings(bindings, binding, flags, entries.storage_texels); AddBindings(bindings, binding, 
flags, entries.images); return binding; } @@ -377,16 +379,17 @@ void AddEntry(std::vector& template_entries, u3 return; } - if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) { - // Nvidia has a bug where updating multiple uniform texels at once causes the driver to - // crash. + if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER || + descriptor_type == STORAGE_TEXEL_BUFFER) { + // Nvidia has a bug where updating multiple texels at once causes the driver to crash. + // Note: Fixed in driver Windows 443.24, Linux 440.66.15 for (u32 i = 0; i < count; ++i) { VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); entry.dstBinding = binding + i; entry.dstArrayElement = 0; entry.descriptorCount = 1; entry.descriptorType = descriptor_type; - entry.offset = offset + i * entry_size; + entry.offset = static_cast(offset + i * entry_size); entry.stride = entry_size; } } else if (count > 0) { @@ -407,8 +410,9 @@ void FillDescriptorUpdateTemplateEntries( std::vector& template_entries) { AddEntry(template_entries, offset, binding, entries.const_buffers); AddEntry(template_entries, offset, binding, entries.global_buffers); - AddEntry(template_entries, offset, binding, entries.texel_buffers); + AddEntry(template_entries, offset, binding, entries.uniform_texels); AddEntry(template_entries, offset, binding, entries.samplers); + AddEntry(template_entries, offset, binding, entries.storage_texels); AddEntry(template_entries, offset, binding, entries.images); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a3d992ed3f..b44b5237a9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -468,8 +468,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { const auto& entries = pipeline.GetEntries(); SetupComputeConstBuffers(entries); SetupComputeGlobalBuffers(entries); - SetupComputeTexelBuffers(entries); + SetupComputeUniformTexels(entries); SetupComputeTextures(entries); + SetupComputeStorageTexels(entries); SetupComputeImages(entries); buffer_cache.Unmap(); @@ -787,8 +788,9 @@ void RasterizerVulkan::SetupShaderDescriptors( const auto& entries = shader->GetEntries(); SetupGraphicsConstBuffers(entries, stage); SetupGraphicsGlobalBuffers(entries, stage); - SetupGraphicsTexelBuffers(entries, stage); + SetupGraphicsUniformTexels(entries, stage); SetupGraphicsTextures(entries, stage); + SetupGraphicsStorageTexels(entries, stage); SetupGraphicsImages(entries, stage); } texture_cache.GuardSamplers(false); @@ -976,12 +978,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, } } -void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) { +void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().Maxwell3D(); - for (const auto& entry : entries.texel_buffers) { + for (const auto& entry : entries.uniform_texels) { const auto image = GetTextureInfo(gpu, entry, stage).tic; - SetupTexelBuffer(image, entry); + SetupUniformTexels(image, entry); } } @@ -996,6 +998,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std:: } } +void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().Maxwell3D(); + for (const auto& entry : 
entries.storage_texels) { + const auto image = GetTextureInfo(gpu, entry, stage).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Images); const auto& gpu = system.GPU().Maxwell3D(); @@ -1028,12 +1039,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { } } -void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) { +void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().KeplerCompute(); - for (const auto& entry : entries.texel_buffers) { + for (const auto& entry : entries.uniform_texels) { const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; - SetupTexelBuffer(image, entry); + SetupUniformTexels(image, entry); } } @@ -1048,6 +1059,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { } } +void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_Textures); + const auto& gpu = system.GPU().KeplerCompute(); + for (const auto& entry : entries.storage_texels) { + const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Images); const auto& gpu = system.GPU().KeplerCompute(); @@ -1097,8 +1117,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd update_descriptor_queue.AddBuffer(buffer, offset, size); } -void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic, - const TexelBufferEntry& entry) { +void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, + const UniformTexelEntry& entry) { const auto view = texture_cache.GetTextureSurface(tic, entry); ASSERT(view->IsBufferView()); @@ -1120,6 +1140,14 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu sampled_views.push_back(ImageView{std::move(view), image_layout}); } +void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic, + const StorageTexelEntry& entry) { + const auto view = texture_cache.GetImageSurface(tic, entry); + ASSERT(view->IsBufferView()); + + update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); +} + void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) { auto view = texture_cache.GetImageSurface(tic, entry); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 0ed0e48c67..04be37a5ee 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -193,12 +193,15 @@ private: /// Setup global buffers in the graphics pipeline. void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage); - /// Setup texel buffers in the graphics pipeline. - void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage); + /// Setup uniform texels in the graphics pipeline. + void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); /// Setup textures in the graphics pipeline. void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage); + /// Setup storage texels in the graphics pipeline. 
+ void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage); + /// Setup images in the graphics pipeline. void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); @@ -209,11 +212,14 @@ private: void SetupComputeGlobalBuffers(const ShaderEntries& entries); /// Setup texel buffers in the compute pipeline. - void SetupComputeTexelBuffers(const ShaderEntries& entries); + void SetupComputeUniformTexels(const ShaderEntries& entries); /// Setup textures in the compute pipeline. void SetupComputeTextures(const ShaderEntries& entries); + /// Setup storage texels in the compute pipeline. + void SetupComputeStorageTexels(const ShaderEntries& entries); + /// Setup images in the compute pipeline. void SetupComputeImages(const ShaderEntries& entries); @@ -222,10 +228,12 @@ private: void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); - void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry); + void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry); void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); + void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry); + void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index a13e8baa72..b9128bd380 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -400,8 +400,9 @@ private: u32 binding = specialization.base_binding; binding = DeclareConstantBuffers(binding); binding = DeclareGlobalBuffers(binding); - binding = DeclareTexelBuffers(binding); + binding = DeclareUniformTexels(binding); binding = DeclareSamplers(binding); + binding = DeclareStorageTexels(binding); binding = DeclareImages(binding); const Id main = OpFunction(t_void, {}, TypeFunction(t_void)); @@ -889,7 +890,7 @@ private: return binding; } - u32 DeclareTexelBuffers(u32 binding) { + u32 DeclareUniformTexels(u32 binding) { for (const auto& sampler : ir.GetSamplers()) { if (!sampler.is_buffer) { continue; @@ -910,7 +911,7 @@ private: Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id}); + uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id}); } return binding; } @@ -945,31 +946,48 @@ private: return binding; } - u32 DeclareImages(u32 binding) { + u32 DeclareStorageTexels(u32 binding) { for (const auto& image : ir.GetImages()) { - const auto [dim, arrayed] = GetImageDim(image); - constexpr int depth = 0; - constexpr bool ms = false; - constexpr int sampled = 2; // This won't be accessed with a sampler - constexpr auto format = spv::ImageFormat::Unknown; - const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); - const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); - const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); - - Decorate(id, spv::Decoration::Binding, binding++); - Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - if (image.is_read && !image.is_written) { - Decorate(id, 
spv::Decoration::NonWritable); - } else if (image.is_written && !image.is_read) { - Decorate(id, spv::Decoration::NonReadable); + if (image.type != Tegra::Shader::ImageType::TextureBuffer) { + continue; } - - images.emplace(image.index, StorageImage{image_type, id}); + DeclareImage(image, binding); } return binding; } + u32 DeclareImages(u32 binding) { + for (const auto& image : ir.GetImages()) { + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + continue; + } + DeclareImage(image, binding); + } + return binding; + } + + void DeclareImage(const Image& image, u32& binding) { + const auto [dim, arrayed] = GetImageDim(image); + constexpr int depth = 0; + constexpr bool ms = false; + constexpr int sampled = 2; // This won't be accessed with a sampler + constexpr auto format = spv::ImageFormat::Unknown; + const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); + const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); + const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); + AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); + + Decorate(id, spv::Decoration::Binding, binding++); + Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); + if (image.is_read && !image.is_written) { + Decorate(id, spv::Decoration::NonWritable); + } else if (image.is_written && !image.is_read) { + Decorate(id, spv::Decoration::NonReadable); + } + + images.emplace(image.index, StorageImage{image_type, id}); + } + bool IsRenderTargetEnabled(u32 rt) const { for (u32 component = 0; component < 4; ++component) { if (header.ps.IsColorComponentOutputEnabled(rt, component)) { @@ -1674,7 +1692,7 @@ private: const auto& meta = std::get(operation.GetMeta()); const u32 index = meta.sampler.index; if (meta.sampler.is_buffer) { - const auto& entry = texel_buffers.at(index); + const auto& entry = uniform_texels.at(index); return OpLoad(entry.image_type, entry.image); } else { const auto& entry = sampled_images.at(index); @@ -2794,15 +2812,16 @@ private: std::unordered_map output_attributes; std::map constant_buffers; std::map global_buffers; - std::map texel_buffers; + std::map uniform_texels; std::map sampled_images; + std::map storage_texels; std::map images; + std::array frag_colors{}; Id instance_index{}; Id vertex_index{}; Id base_instance{}; Id base_vertex{}; - std::array frag_colors{}; Id frag_depth{}; Id frag_coord{}; Id front_facing{}; @@ -3058,13 +3077,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { } for (const auto& sampler : ir.GetSamplers()) { if (sampler.is_buffer) { - entries.texel_buffers.emplace_back(sampler); + entries.uniform_texels.emplace_back(sampler); } else { entries.samplers.emplace_back(sampler); } } for (const auto& image : ir.GetImages()) { - entries.images.emplace_back(image); + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + entries.storage_texels.emplace_back(image); + } else { + entries.images.emplace_back(image); + } } for (const auto& attribute : ir.GetInputAttributes()) { if (IsGenericAttribute(attribute)) { diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index b7af263882..2b0e903967 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -21,8 +21,9 @@ class VKDevice; namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using TexelBufferEntry = 
VideoCommon::Shader::Sampler; +using UniformTexelEntry = VideoCommon::Shader::Sampler; using SamplerEntry = VideoCommon::Shader::Sampler; +using StorageTexelEntry = VideoCommon::Shader::Image; using ImageEntry = VideoCommon::Shader::Image; constexpr u32 DESCRIPTOR_SET = 0; @@ -66,13 +67,15 @@ private: struct ShaderEntries { u32 NumBindings() const { return static_cast(const_buffers.size() + global_buffers.size() + - texel_buffers.size() + samplers.size() + images.size()); + uniform_texels.size() + samplers.size() + storage_texels.size() + + images.size()); } std::vector const_buffers; std::vector global_buffers; - std::vector texel_buffers; + std::vector uniform_texels; std::vector samplers; + std::vector storage_texels; std::vector images; std::set attributes; std::array clip_distances{}; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 2f1d5021d0..ea487b7708 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params, ci.pNext = nullptr; ci.flags = 0; ci.size = static_cast(host_memory_size); - ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; + ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; ci.queueFamilyIndexCount = 0; ci.pQueueFamilyIndices = nullptr; From 866c1165afea021b546de46bf2fc909dd5733392 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 2 Jun 2020 02:12:25 -0300 Subject: [PATCH 033/145] vk_shader_decompiler: Implement atomic image operations Implement atomic operations on images. On GLSL these are atomicImage* functions (e.g. atomicImageAdd). --- externals/sirit | 2 +- .../renderer_vulkan/vk_shader_decompiler.cpp | 64 +++++++------------ 2 files changed, 25 insertions(+), 41 deletions(-) diff --git a/externals/sirit b/externals/sirit index a62c5bbc10..eefca56afd 160000 --- a/externals/sirit +++ b/externals/sirit @@ -1 +1 @@ -Subproject commit a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167ce +Subproject commit eefca56afd49379bdebc97ded8b480839f930881 diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index b9128bd380..97429cc59f 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -971,7 +971,7 @@ private: constexpr int depth = 0; constexpr bool ms = false; constexpr int sampled = 2; // This won't be accessed with a sampler - constexpr auto format = spv::ImageFormat::Unknown; + const auto format = image.is_atomic ? 
spv::ImageFormat::R32ui : spv::ImageFormat::Unknown; const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); @@ -1274,7 +1274,7 @@ private: } else { UNREACHABLE_MSG("Unmanaged offset node type"); } - pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index, + pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index, buffer_element); } return {OpLoad(t_float, pointer), Type::Float}; @@ -1629,7 +1629,7 @@ private: const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b); const Id carry = OpCompositeExtract(t_uint, result, 1); - return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool}; + return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool}; } Expression LogicalAssign(Operation operation) { @@ -1969,39 +1969,20 @@ private: return {}; } - Expression AtomicImageAdd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + template + Expression AtomicImage(Operation operation) { + const auto& meta{std::get(operation.GetMeta())}; + ASSERT(meta.values.size() == 1); - Expression AtomicImageMin(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + const Id coordinate = GetCoordinates(operation, Type::Int); + const Id image = images.at(meta.image.index).image; + const Id sample = v_uint_zero; + const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample); - Expression AtomicImageMax(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageAnd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageOr(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageXor(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageExchange(Operation operation) { - UNIMPLEMENTED(); - return {}; + const Id scope = Constant(t_uint, static_cast(spv::Scope::Device)); + const Id semantics = v_uint_zero; + const Id value = AsUint(Visit(meta.values[0])); + return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; } template @@ -2016,7 +1997,7 @@ private: return {v_float_zero, Type::Float}; } const Id scope = Constant(t_uint, static_cast(spv::Scope::Device)); - const Id semantics = Constant(t_uint, 0); + const Id semantics = v_uint_zero; const Id value = AsUint(Visit(operation[1])); return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; @@ -2640,11 +2621,11 @@ private: &SPIRVDecompiler::ImageLoad, &SPIRVDecompiler::ImageStore, - &SPIRVDecompiler::AtomicImageAdd, - &SPIRVDecompiler::AtomicImageAnd, - &SPIRVDecompiler::AtomicImageOr, - &SPIRVDecompiler::AtomicImageXor, - &SPIRVDecompiler::AtomicImageExchange, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>, @@ -2786,8 +2767,11 @@ private: Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); + const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint); + const Id 
v_float_zero = Constant(t_float, 0.0f); const Id v_float_one = Constant(t_float, 1.0f); + const Id v_uint_zero = Constant(t_uint, 0); // Nvidia uses these defaults for varyings (e.g. position and generic attributes) const Id v_varying_default = From 3a20e74f4056a03253e563510048e99ea548d5b4 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Tue, 2 Jun 2020 16:37:06 +1000 Subject: [PATCH 034/145] Pass by reference instead of copying parameters --- src/video_core/engines/maxwell_3d.cpp | 10 ++++++---- src/video_core/engines/maxwell_3d.h | 2 +- src/video_core/macro/macro.cpp | 2 +- src/video_core/macro/macro.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 934a1d6dbd..0c0a7c1ed7 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -115,7 +115,7 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::vector&& parameters) { +void Maxwell3D::CallMacroMethod(u32 method, std::vector& parameters) { // Reset the current macro. executing_macro = 0; @@ -124,7 +124,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector&& parameters) { ((method - MacroRegistersStart) >> 1) % static_cast(macro_positions.size()); // Execute the current macro. - macro_engine->Execute(macro_positions[entry], std::move(parameters)); + macro_engine->Execute(macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } @@ -160,7 +160,8 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Call the macro when there are no more parameters in the command buffer if (is_last_call) { - CallMacroMethod(executing_macro, std::move(macro_params)); + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); } return; } @@ -304,7 +305,8 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Call the macro when there are no more parameters in the command buffer if (amount == methods_pending) { - CallMacroMethod(executing_macro, std::move(macro_params)); + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); } return; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 077bc9841b..651f37b4d8 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1495,7 +1495,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::vector&& parameters); + void CallMacroMethod(u32 method, std::vector& parameters); /// Handles writes to the macro uploading register. 
void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index 85a6e5dd42..d05a884793 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -15,7 +15,7 @@ void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); } -void MacroEngine::Execute(u32 method, std::vector parameters) { +void MacroEngine::Execute(u32 method, std::vector& parameters) { auto compiled_macro = macro_cache.find(method); if (compiled_macro != macro_cache.end()) { compiled_macro->second->Execute(parameters, method); diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index 28ca243d10..49fc4d6bc2 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -113,7 +113,7 @@ public: void AddCode(u32 method, u32 data); // Compiles the macro if its not in the cache, and executes the compiled macro - void Execute(u32 method, std::vector parameters); + void Execute(u32 method, std::vector& parameters); protected: virtual std::unique_ptr Compile(const std::vector& code) = 0; From aaa4822fcb60676473243619703c1af24a4d47b9 Mon Sep 17 00:00:00 2001 From: FearlessTobi Date: Wed, 3 Jun 2020 05:17:34 +0200 Subject: [PATCH 035/145] Actually save the input when clearing/resetting to default Co-Authored-By: xperia64 --- src/yuzu/configuration/configure_input_player.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp index e4eb5594bd..a05fa64ba3 100644 --- a/src/yuzu/configuration/configure_input_player.cpp +++ b/src/yuzu/configuration/configure_input_player.cpp @@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() { SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]); } } + UpdateButtonLabels(); + ApplyConfiguration(); } void ConfigureInputPlayer::ClearAll() { @@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() { } UpdateButtonLabels(); + ApplyConfiguration(); } void ConfigureInputPlayer::UpdateButtonLabels() { From 411f5527d41ba5c4f09b914b4fb4df0c6493f744 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Wed, 3 Jun 2020 16:33:38 +1000 Subject: [PATCH 036/145] Mark parameters as const --- src/video_core/engines/maxwell_3d.cpp | 2 +- src/video_core/engines/maxwell_3d.h | 3 +-- src/video_core/macro/macro.cpp | 2 +- src/video_core/macro/macro.h | 4 ++-- src/video_core/macro/macro_interpreter.cpp | 2 +- src/video_core/macro/macro_interpreter.h | 2 +- src/video_core/macro/macro_jit_x64.cpp | 2 +- src/video_core/macro/macro_jit_x64.h | 5 +++-- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 0c0a7c1ed7..14ad402502 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -115,7 +115,7 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::vector& parameters) { +void Maxwell3D::CallMacroMethod(u32 method, const std::vector& parameters) { // Reset the current macro. executing_macro = 0; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 651f37b4d8..b827b112fc 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1466,7 +1466,6 @@ private: /// Interpreter for the macro codes uploaded to the GPU. 
std::unique_ptr macro_engine; - // MacroInterpreter macro_interpreter; static constexpr u32 null_cb_data = 0xFFFFFFFF; struct { @@ -1495,7 +1494,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::vector& parameters); + void CallMacroMethod(u32 method, const std::vector& parameters); /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index d05a884793..89077a2d85 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -15,7 +15,7 @@ void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); } -void MacroEngine::Execute(u32 method, std::vector& parameters) { +void MacroEngine::Execute(u32 method, const std::vector& parameters) { auto compiled_macro = macro_cache.find(method); if (compiled_macro != macro_cache.end()) { compiled_macro->second->Execute(parameters, method); diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index 49fc4d6bc2..b76ed891f1 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -102,7 +102,7 @@ public: * @param code The macro byte code to execute * @param parameters The parameters of the macro */ - virtual void Execute(std::vector& parameters, u32 method) = 0; + virtual void Execute(const std::vector& parameters, u32 method) = 0; }; class MacroEngine { @@ -113,7 +113,7 @@ public: void AddCode(u32 method, u32 data); // Compiles the macro if its not in the cache, and executes the compiled macro - void Execute(u32 method, std::vector& parameters); + void Execute(u32 method, const std::vector& parameters); protected: virtual std::unique_ptr Compile(const std::vector& code) = 0; diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index e63296a21f..5edff27aad 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -21,7 +21,7 @@ MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector& code) : maxwell3d(maxwell3d), code(code) {} -void MacroInterpreterImpl::Execute(std::vector& parameters, u32 method) { +void MacroInterpreterImpl::Execute(const std::vector& parameters, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index fb923f7b9f..90217fc89f 100644 --- a/src/video_core/macro/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -29,7 +29,7 @@ private: class MacroInterpreterImpl : public CachedMacro { public: MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector& code); - void Execute(std::vector& parameters, u32 method) override; + void Execute(const std::vector& parameters, u32 method) override; private: /// Resets the execution engine state, zeroing registers, etc. 
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 48501e582c..11c1cc3be8 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -47,7 +47,7 @@ MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vecto MacroJITx64Impl::~MacroJITx64Impl() = default; -void MacroJITx64Impl::Execute(std::vector& parameters, u32 method) { +void MacroJITx64Impl::Execute(const std::vector& parameters, u32 method) { MICROPROFILE_SCOPE(MacroJitExecute); ASSERT_OR_EXECUTE(program != nullptr, { return; }); JITState state{}; diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 729ed77137..6152cb5013 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -13,6 +13,7 @@ #include "video_core/macro/macro.h" namespace Tegra { + namespace Engines { class Maxwell3D; } @@ -36,7 +37,7 @@ public: MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector& code); ~MacroJITx64Impl(); - void Execute(std::vector& parameters, u32 method) override; + void Execute(const std::vector& parameters, u32 method) override; void Compile_ALU(Macro::Opcode opcode); void Compile_AddImmediate(Macro::Opcode opcode); @@ -66,7 +67,7 @@ private: struct JITState { Engines::Maxwell3D* maxwell3d{}; std::array registers{}; - u32* parameters{}; + const u32* parameters{}; u32 carry_flag{}; }; static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); From 3d99b449d3c285a50ffb68ae077882cc2998d62f Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 3 Jun 2020 02:48:41 -0300 Subject: [PATCH 037/145] gl_rasterizer: Use NV_transform_feedback for XFB on assembly shaders NV_transform_feedback, NV_transform_feedback2 and ARB_transform_feedback3 with NV_transform_feedback interactions allows implementing transform feedbacks as dynamic state. Maxwell implements transform feedbacks as dynamic state, so using these extensions with TransformFeedbackStreamAttribsNV allows us to properly emulate transform feedbacks without having to recompile shaders when the state changes. 
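For reference, a minimal sketch of the call this change assembles dynamically. Each attribute record is three GLints ({attribute enum, component count, attribute index}), and a GL_NEXT_BUFFER_NV record advances to the next transform feedback buffer binding. All values below are illustrative, not state read from a real guest.

    // Hedged example: capture gl_Position into buffer 0 and generic output 3
    // into buffer 1.
    const GLint attribs[] = {
        GL_POSITION,          4, 0, // vec4 gl_Position
        GL_NEXT_BUFFER_NV,    0, 0, // advance to the next buffer binding
        GL_GENERIC_ATTRIB_NV, 4, 3, // generic varying 3
    };
    const GLint bufstreams[] = {0, 0}; // one entry per captured buffer
    glTransformFeedbackStreamAttribsNV(3, attribs, 2, bufstreams, GL_INTERLEAVED_ATTRIBS);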
--- src/video_core/renderer_opengl/gl_device.cpp | 3 +- .../renderer_opengl/gl_rasterizer.cpp | 90 +++++++++++++++++++ .../renderer_opengl/gl_rasterizer.h | 4 + 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index b772c37d90..8b424e2cbf 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -206,7 +206,8 @@ Device::Device() has_precise_bug = TestPreciseBug(); has_fast_buffer_sub_data = is_nvidia; use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && - GLAD_GL_NV_compute_program5; + GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && + GLAD_GL_NV_transform_feedback2; LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 55e79aaf65..f802fd384d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -93,6 +93,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, return buffer.size; } +/// Translates hardware transform feedback indices +/// @param location Hardware location +/// @return Pair of ARB_transform_feedback3 token stream first and third arguments +/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt +std::pair TransformFeedbackEnum(u8 location) { + const u8 index = location / 4; + if (index >= 8 && index <= 39) { + return {GL_GENERIC_ATTRIB_NV, index - 8}; + } + if (index >= 48 && index <= 55) { + return {GL_TEXTURE_COORD_NV, index - 48}; + } + switch (index) { + case 7: + return {GL_POSITION, 0}; + case 40: + return {GL_PRIMARY_COLOR_NV, 0}; + case 41: + return {GL_SECONDARY_COLOR_NV, 0}; + case 42: + return {GL_BACK_PRIMARY_COLOR_NV, 0}; + case 43: + return {GL_BACK_SECONDARY_COLOR_NV, 0}; + } + UNIMPLEMENTED_MSG("index={}", static_cast(index)); + return {GL_POSITION, 0}; +} + void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } @@ -1547,12 +1575,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() { oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); } +void RasterizerOpenGL::SyncTransformFeedback() { + // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal + // when this is required. 
+ const auto& regs = system.GPU().Maxwell3D().regs; + + static constexpr std::size_t STRIDE = 3; + std::array attribs; + std::array streams; + + GLint* cursor = attribs.data(); + GLint* current_stream = streams.data(); + + for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { + const auto& layout = regs.tfb_layouts[feedback]; + UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding"); + if (layout.varying_count == 0) { + continue; + } + + *current_stream = static_cast(feedback); + if (current_stream != streams.data()) { + // When stepping one stream, push the expected token + cursor[0] = GL_NEXT_BUFFER_NV; + cursor[1] = 0; + cursor[2] = 0; + cursor += STRIDE; + } + ++current_stream; + + const auto& locations = regs.tfb_varying_locs[feedback]; + std::optional current_index; + for (u32 offset = 0; offset < layout.varying_count; ++offset) { + const u8 location = locations[offset]; + const u8 index = location / 4; + + if (current_index == index) { + // Increase number of components of the previous attachment + ++cursor[-2]; + continue; + } + current_index = index; + + std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location); + cursor[1] = 1; + cursor += STRIDE; + } + } + + const GLsizei num_attribs = static_cast((cursor - attribs.data()) / STRIDE); + const GLsizei num_strides = static_cast(current_stream - streams.data()); + glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(), + GL_INTERLEAVED_ATTRIBS); +} + void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { const auto& regs = system.GPU().Maxwell3D().regs; if (regs.tfb_enabled == 0) { return; } + if (device.UseAssemblyShaders()) { + SyncTransformFeedback(); + } + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); @@ -1579,6 +1665,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { static_cast(size)); } + // We may have to call BeginTransformFeedbackNV here since they seem to call different + // implementations on Nvidia's driver (the pointer is different) but we are using + // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB + // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works. 
glBeginTransformFeedback(GL_POINTS); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index f5dc56a0e2..7abc8fdbd4 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -202,6 +202,10 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs transform feedback state to match guest state + /// @note Only valid on assembly shaders + void SyncTransformFeedback(); + /// Begin a transform feedback void BeginTransformFeedback(GLenum primitive_mode); From eca3d16e548451f403cc4cc2e9fb38ed96693102 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Thu, 4 Jun 2020 22:23:07 +1000 Subject: [PATCH 038/145] Default init labels and use initializer list for macro engine --- src/video_core/engines/maxwell_3d.cpp | 2 +- src/video_core/macro/macro_jit_x64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 14ad402502..951016c3e8 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -25,7 +25,7 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, - macro_engine(GetMacroEngine(*this)), upload_state{memory_manager, regs.upload} { + macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} { dirty.flags.flip(); InitializeRegisterDefaults(); } diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 6152cb5013..21ee157cf4 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -85,7 +85,7 @@ private: std::optional next_opcode{}; ProgramType program{nullptr}; - std::array labels; + std::array labels{}; std::array delay_skip{}; Xbyak::Label end_of_code{}; From c0d2e3212f0d8c3a8dee21b13e7d9a53815ca97d Mon Sep 17 00:00:00 2001 From: David Marcec Date: Thu, 4 Jun 2020 22:27:15 +1000 Subject: [PATCH 039/145] Downgrade "handle not signaled" error to trace clogs logs quite a bit --- src/core/hle/kernel/readable_event.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp index 00860fcbd1..ef5e19e63b 100644 --- a/src/core/hle/kernel/readable_event.cpp +++ b/src/core/hle/kernel/readable_event.cpp @@ -38,7 +38,7 @@ void ReadableEvent::Clear() { ResultCode ReadableEvent::Reset() { if (!is_signaled) { - LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}", + LOG_TRACE(Kernel, "Handle is not signaled! 
object_id={}, object_type={}, object_name={}", GetObjectId(), GetTypeName(), GetName()); return ERR_INVALID_STATE; } From e1438f8e913293dd70fb4ee7818312c3f6d64d77 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 4 Jun 2020 23:02:55 -0300 Subject: [PATCH 040/145] shader/track: Move bindless tracking to a separate function --- src/video_core/shader/shader_ir.h | 10 ++++-- src/video_core/shader/track.cpp | 54 ++++++++++++++++++------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 15ae152f25..8f522edf0a 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -409,8 +409,14 @@ private: std::tuple TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; - std::tuple TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor); + std::pair TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor); + + std::pair HandleBindlessIndirectRead(const CbufNode& cbuf, + const OperationNode& operation, + Node gpr, Node base_offset, + Node tracked, const NodeBlock& code, + s64 cursor); std::optional TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index eb97bfd411..435f4facbc 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -14,6 +14,7 @@ namespace VideoCommon::Shader { namespace { + std::pair FindOperation(const NodeBlock& code, s64 cursor, OperationCode operation_code) { for (; cursor >= 0; --cursor) { @@ -72,40 +73,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { } // Anonymous namespace -std::tuple ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor) { +std::pair ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor) { if (const auto cbuf = std::get_if(&*tracked)) { + const u32 cbuf_index = cbuf->GetIndex(); + // Constant buffer found, test if it's an immediate const auto& offset = cbuf->GetOffset(); if (const auto immediate = std::get_if(&*offset)) { - auto track = - MakeTrackSampler(cbuf->GetIndex(), immediate->GetValue()); + auto track = MakeTrackSampler(cbuf_index, immediate->GetValue()); return {tracked, track}; } if (const auto operation = std::get_if(&*offset)) { const u32 bound_buffer = registry.GetBoundBuffer(); - if (bound_buffer != cbuf->GetIndex()) { + if (bound_buffer != cbuf_index) { return {}; } - const auto pair = DecoupleIndirectRead(*operation); - if (!pair) { - return {}; + if (const std::optional pair = DecoupleIndirectRead(*operation)) { + auto [gpr, base_offset] = *pair; + return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked, + code, cursor); } - auto [gpr, base_offset] = *pair; - const auto offset_inm = std::get_if(&*base_offset); - const auto& gpu_driver = registry.AccessGuestDriverProfile(); - const u32 bindless_cv = NewCustomVariable(); - Node op = - Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize())); - - const Node cv_node = GetCustomVariable(bindless_cv); - Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); - const std::size_t amend_index = DeclareAmend(std::move(amend_op)); - AmendNodeCv(amend_index, code[cursor]); - // TODO Implement Bindless Index custom variable - auto track = MakeTrackSampler(cbuf->GetIndex(), - offset_inm->GetValue(), bindless_cv); - return {tracked, track}; } return {}; } @@ -139,6 +127,26 @@ 
std::tuple ShaderIR::TrackBindlessSampler(Node tracked, cons return {}; } +std::pair ShaderIR::HandleBindlessIndirectRead( + const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked, + const NodeBlock& code, s64 cursor) { + const auto offset_imm = std::get(*base_offset); + const auto& gpu_driver = registry.AccessGuestDriverProfile(); + const u32 bindless_cv = NewCustomVariable(); + const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize(); + Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size)); + + Node cv_node = GetCustomVariable(bindless_cv); + Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op)); + const std::size_t amend_index = DeclareAmend(std::move(amend_op)); + AmendNodeCv(amend_index, code[cursor]); + + // TODO: Implement bindless index custom variable + auto track = + MakeTrackSampler(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv); + return {tracked, track}; +} + std::tuple ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { if (const auto cbuf = std::get_if(&*tracked)) { From 5b2b6d594c6cfa77c3fb92faee63ad524bfe7204 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 4 Jun 2020 23:03:49 -0300 Subject: [PATCH 041/145] shader/texture: Join separate image and sampler pairs offline Games using D3D idioms can join images and samplers when a shader executes, instead of baking them into a combined image sampler. This is also possible on Vulkan. One approach to this problem would be to use separate samplers on Vulkan and leave this unimplemented on OpenGL, but we can't do this because there's no consistent way of determining which constant buffer holds a sampler and which one an image. We could in theory find the first bit and if it's in the TIC area, it's an image; but this falls apart when an image or sampler handle uses an index of zero. The approach used here is to look for a LOP.OR operation (this is done at the IR level, not at the ISA level), track the constant buffers used as its sources, and store this pair. Then, outside of shader execution, the sampler and image pair is joined with a bitwise OR operation. This approach won't work on games that truly use separate samplers in a meaningful way. For example, pooling textures in a 2D array and determining at runtime what sampler to use.
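To make the offline join concrete, once a separated pair has been identified the texture lookup boils down to something like the following (a simplified sketch of the rasterizer changes below; variable names are illustrative):

    // Read the image and sampler halves from their respective constant buffers
    // and OR them together before querying the texture information.
    const u32 image_handle = engine.AccessConstBuffer32(stage, entry.buffer, entry.offset);
    const u32 sampler_handle =
        engine.AccessConstBuffer32(stage, entry.secondary_buffer, entry.secondary_offset);
    const auto info = engine.GetTextureInfo(image_handle | sampler_handle);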
This invalidates OpenGL's disk shader cache :) - Used mostly by D3D ports to Switch --- .../engines/const_buffer_engine_interface.h | 1 + src/video_core/engines/kepler_compute.cpp | 5 +- src/video_core/engines/kepler_compute.h | 2 + src/video_core/engines/maxwell_3d.cpp | 5 +- src/video_core/engines/maxwell_3d.h | 2 + .../renderer_opengl/gl_rasterizer.cpp | 18 ++++- .../renderer_opengl/gl_shader_disk_cache.cpp | 64 ++++++++++++---- .../renderer_opengl/gl_shader_disk_cache.h | 1 + .../renderer_vulkan/vk_rasterizer.cpp | 11 +++ src/video_core/shader/decode/texture.cpp | 55 +++++++++----- src/video_core/shader/node.h | 75 ++++++++----------- src/video_core/shader/node_helper.h | 2 +- src/video_core/shader/registry.cpp | 20 +++++ src/video_core/shader/registry.h | 35 +++++++++ src/video_core/shader/shader_ir.h | 4 +- src/video_core/shader/track.cpp | 24 ++++-- 16 files changed, 235 insertions(+), 89 deletions(-) diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index ebe1395046..f46e81bb7c 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -93,6 +93,7 @@ public: virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0; virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; + virtual SamplerDescriptor AccessSampler(u32 handle) const = 0; virtual u32 GetBoundBuffer() const = 0; virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index f6237fc6a2..a82b06a382 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con ASSERT(stage == ShaderType::Compute); const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; + return AccessSampler(memory_manager.Read(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read(tex_info_address)}; +SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 18ceedfaf5..b7f668d887 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -219,6 +219,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 13ef2e42db..d539ae7b18 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -743,8 +743,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const auto& shader = state.shader_stages[static_cast(stage)]; const auto& tex_info_buffer = shader.const_buffers[const_buffer]; const GPUVAddr 
tex_info_address = tex_info_buffer.address + offset; + return AccessSampler(memory_manager.Read(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read(tex_info_address)}; +SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 05dd6b39bf..c5f1b04999 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1403,6 +1403,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 55e79aaf65..aedcdcb783 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -65,10 +65,22 @@ constexpr std::size_t NumSupportedVertexAttributes = 16; template Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, ShaderType shader_type, std::size_t index = 0) { - if (entry.is_bindless) { - const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); - return engine.GetTextureInfo(tex_handle); + if constexpr (std::is_same_v) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } } + if (entry.is_bindless) { + const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return engine.GetTextureInfo(handle); + } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); const u32 offset = entry.offset + static_cast(index * gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v) { diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b2..653c3f2f96 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap; namespace { +using VideoCommon::Shader::SeparateSamplerKey; + using ShaderCacheVersionHash = std::array; struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey { u32 value = 0; }; -struct BoundSamplerKey { +struct BoundSamplerEntry { u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { + u32 cbuf1 = 0; + u32 cbuf2 = 0; + u32 offset1 = 0; + u32 offset2 = 0; + Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry { u32 cbuf = 0; u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -87,12 
+97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { u32 texture_handler_size_value; u32 num_keys; u32 num_bound_samplers; + u32 num_separate_samplers; u32 num_bindless_samplers; if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || file.ReadArray(&is_texture_handler_size_known, 1) != 1 || file.ReadArray(&texture_handler_size_value, 1) != 1 || file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_separate_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { return false; } @@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { } std::vector flat_keys(num_keys); - std::vector flat_bound_samplers(num_bound_samplers); - std::vector flat_bindless_samplers(num_bindless_samplers); + std::vector flat_bound_samplers(num_bound_samplers); + std::vector flat_separate_samplers(num_separate_samplers); + std::vector flat_bindless_samplers(num_bindless_samplers); if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != flat_bound_samplers.size() || + file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != + flat_separate_samplers.size() || file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != flat_bindless_samplers.size()) { return false; } - for (const auto& key : flat_keys) { - keys.insert({{key.cbuf, key.offset}, key.value}); + for (const auto& entry : flat_keys) { + keys.insert({{entry.cbuf, entry.offset}, entry.value}); } - for (const auto& key : flat_bound_samplers) { - bound_samplers.emplace(key.offset, key.sampler); + for (const auto& entry : flat_bound_samplers) { + bound_samplers.emplace(entry.offset, entry.sampler); } - for (const auto& key : flat_bindless_samplers) { - bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + for (const auto& entry : flat_separate_samplers) { + SeparateSamplerKey key; + key.buffers = {entry.cbuf1, entry.cbuf2}; + key.offsets = {entry.offset1, entry.offset2}; + separate_samplers.emplace(key, entry.sampler); + } + for (const auto& entry : flat_bindless_samplers) { + bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler}); } return true; @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || file.WriteObject(static_cast(keys.size())) != 1 || file.WriteObject(static_cast(bound_samplers.size())) != 1 || + file.WriteObject(static_cast(separate_samplers.size())) != 1 || file.WriteObject(static_cast(bindless_samplers.size())) != 1) { return false; } @@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); } - std::vector flat_bound_samplers; + std::vector flat_bound_samplers; flat_bound_samplers.reserve(bound_samplers.size()); for (const auto& [address, sampler] : bound_samplers) { - flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); } - std::vector flat_bindless_samplers; + std::vector flat_separate_samplers; + flat_separate_samplers.reserve(separate_samplers.size()); + for (const auto& [key, sampler] : separate_samplers) { + SeparateSamplerEntry entry; + std::tie(entry.cbuf1, 
entry.cbuf2) = key.buffers; + std::tie(entry.offset1, entry.offset2) = key.offsets; + entry.sampler = sampler; + flat_separate_samplers.push_back(entry); + } + + std::vector flat_bindless_samplers; flat_bindless_samplers.reserve(bindless_samplers.size()); for (const auto& [address, sampler] : bindless_samplers) { flat_bindless_samplers.push_back( - BindlessSamplerKey{address.first, address.second, sampler}); + BindlessSamplerEntry{address.first, address.second, sampler}); } return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == flat_bound_samplers.size() && + file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == + flat_separate_samplers.size() && file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == flat_bindless_samplers.size(); } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e401..a79cef0e9c 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry { VideoCommon::Shader::ComputeInfo compute_info; VideoCommon::Shader::KeyMap keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::SeparateSamplerMap separate_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a3d992ed3f..42a20530d4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -117,6 +117,17 @@ template Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, std::size_t stage, std::size_t index = 0) { const auto stage_type = static_cast(stage); + if constexpr (std::is_same_v) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } if (entry.is_bindless) { const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset); return engine.GetTextureInfo(tex_handle); diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 8f0bb996ec..29ebf65bac 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { return pc; } -ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional buffer) { +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( + SamplerInfo info, std::optional sampler) { if (info.IsComplete()) { return info; } - const auto sampler = buffer ? 
registry.ObtainBindlessSampler(*buffer, offset) - : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); @@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, std::optional ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo sampler_info) { - const auto offset = static_cast(sampler.index.Value()); - const auto info = GetSamplerInfo(sampler_info, offset); + const u32 offset = static_cast(sampler.index.Value()); + const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), @@ -404,20 +402,19 @@ std::optional ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast(global_code.size())); - ASSERT(base_node != nullptr); - if (base_node == nullptr) { + if (!base_node) { + UNREACHABLE(); return std::nullopt; } - if (const auto bindless_sampler_info = - std::get_if(&*tracked_sampler_info)) { - const u32 buffer = bindless_sampler_info->GetIndex(); - const u32 offset = bindless_sampler_info->GetOffset(); - info = GetSamplerInfo(info, offset, buffer); + if (const auto sampler_info = std::get_if(&*tracked_sampler_info)) { + const u32 buffer = sampler_info->index; + const u32 offset = sampler_info->offset; + info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { + [buffer, offset](const Sampler& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != used_samplers.end()) { @@ -431,10 +428,32 @@ std::optional ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array, *info.is_shadow, *info.is_buffer, false); } - if (const auto array_sampler_info = std::get_if(&*tracked_sampler_info)) { - const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; - index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); - info = GetSamplerInfo(info, base_offset); + if (const auto sampler_info = std::get_if(&*tracked_sampler_info)) { + const std::pair indices = sampler_info->indices; + const std::pair offsets = sampler_info->offsets; + info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); + + // Try to use an already created sampler if it exists + const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); + if (it != used_samplers.end()) { + ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; + } + + // Otherwise create a new mapping for this sampler + const u32 next_index = static_cast(used_samplers.size()); + return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer); + } + if 
(const auto sampler_info = std::get_if(&*tracked_sampler_info)) { + const u32 base_offset = sampler_info->base_offset / 4; + index_var = GetCustomVariable(sampler_info->bindless_var); + info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if( diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index c5e5165ff8..8f230d57a6 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -275,10 +275,11 @@ using Node = std::shared_ptr; using Node4 = std::array; using NodeBlock = std::vector; -class BindlessSamplerNode; -class ArraySamplerNode; +struct ArraySamplerNode; +struct BindlessSamplerNode; +struct SeparateSamplerNode; -using TrackSamplerData = std::variant; +using TrackSamplerData = std::variant; using TrackSampler = std::shared_ptr; struct Sampler { @@ -288,63 +289,51 @@ struct Sampler { : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_indexed{is_indexed} {} + /// Separate sampler constructor + constexpr explicit Sampler(u32 index, std::pair offsets, std::pair buffers, + Tegra::Shader::TextureType type, bool is_array, bool is_shadow, + bool is_buffer) + : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, + is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + /// Bindless samplers constructor constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} - u32 index = 0; ///< Emulated index given for the this sampler. - u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. - u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size = 1; ///< Size of the sampler. + u32 index = 0; ///< Emulated index given for the this sampler. + u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. + u32 secondary_offset = 0; ///< Secondary offset in the const buffer. + u32 buffer = 0; ///< Buffer where the bindless sampler is read. + u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read. + u32 size = 1; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) - bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. - bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. - bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. - bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. - bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. + bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. + bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. + bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. 
+ bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_separated = false; ///< Whether the image and sampler is separated or not. }; /// Represents a tracked bindless sampler into a direct const buffer -class ArraySamplerNode final { -public: - explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) - : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetBaseOffset() const { - return base_offset; - } - - constexpr u32 GetIndexVar() const { - return bindless_var; - } - -private: +struct ArraySamplerNode { u32 index; u32 base_offset; u32 bindless_var; }; +/// Represents a tracked separate sampler image pair that was folded statically +struct SeparateSamplerNode { + std::pair indices; + std::pair offsets; +}; + /// Represents a tracked bindless sampler into a direct const buffer -class BindlessSamplerNode final { -public: - explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } - -private: +struct BindlessSamplerNode { u32 index; u32 offset; }; diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 11231bbea8..1e0886185d 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) { template TrackSampler MakeTrackSampler(Args&&... args) { static_assert(std::is_convertible_v); - return std::make_shared(T(std::forward(args)...)); + return std::make_shared(T{std::forward(args)...}); } template diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index af70b3f357..cdf274e544 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -93,6 +93,26 @@ std::optional Registry::ObtainBoundSampler(u32 offset) { return value; } +std::optional Registry::ObtainSeparateSampler( + std::pair buffers, std::pair offsets) { + SeparateSamplerKey key; + key.buffers = buffers; + key.offsets = offsets; + const auto iter = separate_samplers.find(key); + if (iter != separate_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + + const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first); + const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second); + const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2); + separate_samplers.emplace(key, value); + return value; +} + std::optional Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 0c80d35fda..2312067657 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -19,8 +19,39 @@ namespace VideoCommon::Shader { +struct SeparateSamplerKey { + std::pair buffers; + std::pair offsets; +}; + +} // namespace VideoCommon::Shader + +namespace std { + +template <> +struct hash { + std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept { + return std::hash{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^ + key.offsets.second); + } +}; + +template <> +struct equal_to { + bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs, + const 
VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept { + return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets; + } +}; + +} // namespace std + +namespace VideoCommon::Shader { + using KeyMap = std::unordered_map, u32, Common::PairHash>; using BoundSamplerMap = std::unordered_map; +using SeparateSamplerMap = + std::unordered_map; using BindlessSamplerMap = std::unordered_map, Tegra::Engines::SamplerDescriptor, Common::PairHash>; @@ -73,6 +104,9 @@ public: std::optional ObtainBoundSampler(u32 offset); + std::optional ObtainSeparateSampler( + std::pair buffers, std::pair offsets); + std::optional ObtainBindlessSampler(u32 buffer, u32 offset); /// Inserts a key. @@ -128,6 +162,7 @@ private: Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; + SeparateSamplerMap separate_samplers; BindlessSamplerMap bindless_samplers; u32 bound_buffer; GraphicsInfo graphics_info; diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 8f522edf0a..3a98b2104f 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -330,8 +330,8 @@ private: OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); /// Queries the missing sampler info from the execution context. - SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional buffer = std::nullopt); + SamplerInfo GetSamplerInfo(SamplerInfo info, + std::optional sampler); /// Accesses a texture sampler. std::optional GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index 435f4facbc..d5ed81442f 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -64,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { if (const auto operation = std::get_if(&*node)) { operation->SetAmendIndex(amend_index); return true; - } else if (const auto conditional = std::get_if(&*node)) { + } + if (const auto conditional = std::get_if(&*node)) { conditional->SetAmendIndex(amend_index); return true; } @@ -110,10 +111,23 @@ std::pair ShaderIR::TrackBindlessSampler(Node tracked, const return TrackBindlessSampler(source, code, new_cursor); } if (const auto operation = std::get_if(&*tracked)) { - for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { - if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); - std::get<0>(found)) { - // Cbuf found in operand. + const OperationNode& op = *operation; + + const OperationCode opcode = operation->GetCode(); + if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) { + ASSERT(op.GetOperandsCount() == 2); + auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor); + auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor); + if (node_a && node_b) { + auto track = MakeTrackSampler(std::pair{index_a, index_b}, + std::pair{offset_a, offset_b}); + return {tracked, std::move(track)}; + } + } + std::size_t i = op.GetOperandsCount(); + while (i--) { + if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) { + // Constant buffer found in operand. return found; } } From 354fbe701e99c1b9cd0bb5463e9a15ca374c12d7 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 5 Jun 2020 21:18:22 -0300 Subject: [PATCH 042/145] renderer_opengl: Only enable DEBUG_OUTPUT when graphics debugging is enabled Avoids logging when it's not relevant. 
This can potentially reduce the driver's internal thread overhead. --- src/video_core/renderer_opengl/renderer_opengl.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index e7952924af..6214fcbc3d 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -751,11 +751,9 @@ void RendererOpenGL::RenderScreenshot() { } bool RendererOpenGL::Init() { - if (GLAD_GL_KHR_debug) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { glEnable(GL_DEBUG_OUTPUT); - if (Settings::values.renderer_debug) { - glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); - } + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); glDebugMessageCallback(DebugHandler, nullptr); } From e78d681a6cabc3a3a3380fadd06f98f7ff7cb665 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 6 Jun 2020 02:56:42 -0300 Subject: [PATCH 043/145] gl_device: Blacklist NVIDIA 443.24 for fast buffer uploads Skip fast buffer uploads on the Nvidia 443.24 Vulkan beta driver on OpenGL. This driver throws the following error when calling BufferSubData or BufferData on buffers that are candidates for fast constant buffer uploads. This is the equivalent of push constants on Vulkan, except that they can access the full buffer. The error: Unknown internal debug message. The NVIDIA OpenGL driver has encountered an out of memory error. This application might behave inconsistently and fail. If this error persists on future drivers, we might have to look deeper into this issue. For now, we can blacklist it and log it as a temporary solution. --- src/video_core/renderer_opengl/gl_device.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index b772c37d90..a14641b97f 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -185,12 +185,20 @@ bool IsASTCSupported() { Device::Device() : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast(glGetString(GL_VENDOR)); - const auto renderer = reinterpret_cast(glGetString(GL_RENDERER)); + const std::string_view version = reinterpret_cast(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; + bool disable_fast_buffer_sub_data = false; + if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { + LOG_WARNING( + Render_OpenGL, + "Beta driver 443.24 is known to have issues.
There might be performance issues."); + disable_fast_buffer_sub_data = true; + } + uniform_buffer_alignment = GetInteger(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger(GL_MAX_VERTEX_ATTRIBS); @@ -204,7 +212,7 @@ Device::Device() has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_fast_buffer_sub_data = is_nvidia; + has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5; From 03fad5ebe86309cf22732ec9b8d3caf3da63f7b9 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sat, 6 Jun 2020 15:56:14 -0400 Subject: [PATCH 044/145] yuzu/frontend: Remove internal resolution option --- src/core/settings.h | 2 +- src/yuzu/configuration/config.cpp | 10 +---- src/yuzu/configuration/configure_graphics.cpp | 45 ------------------- src/yuzu/configuration/configure_graphics.ui | 40 ----------------- src/yuzu/main.cpp | 5 +-- src/yuzu_cmd/config.cpp | 2 - src/yuzu_cmd/default_ini.h | 5 --- src/yuzu_tester/config.cpp | 2 - src/yuzu_tester/default_ini.h | 5 --- 9 files changed, 3 insertions(+), 113 deletions(-) diff --git a/src/core/settings.h b/src/core/settings.h index 78eb33737d..9d916d5cba 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -437,7 +437,7 @@ struct Values { bool renderer_debug; int vulkan_device; - float resolution_factor; + u16 resolution_factor{1}; int aspect_ratio; int max_anisotropy; bool use_frame_limit; diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index b08b874265..d6c9e50136 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -629,13 +629,11 @@ void Config::ReadRendererValues() { static_cast(ReadSetting(QStringLiteral("backend"), 0).toInt()); Settings::values.renderer_debug = ReadSetting(QStringLiteral("debug"), false).toBool(); Settings::values.vulkan_device = ReadSetting(QStringLiteral("vulkan_device"), 0).toInt(); - Settings::values.resolution_factor = - ReadSetting(QStringLiteral("resolution_factor"), 1.0).toFloat(); Settings::values.aspect_ratio = ReadSetting(QStringLiteral("aspect_ratio"), 0).toInt(); Settings::values.max_anisotropy = ReadSetting(QStringLiteral("max_anisotropy"), 0).toInt(); Settings::values.use_frame_limit = ReadSetting(QStringLiteral("use_frame_limit"), true).toBool(); - Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toInt(); + Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toUInt(); Settings::values.use_disk_shader_cache = ReadSetting(QStringLiteral("use_disk_shader_cache"), true).toBool(); const int gpu_accuracy_level = ReadSetting(QStringLiteral("gpu_accuracy"), 0).toInt(); @@ -720,8 +718,6 @@ void Config::ReadUIValues() { .toString(); UISettings::values.enable_discord_presence = ReadSetting(QStringLiteral("enable_discord_presence"), true).toBool(); - UISettings::values.screenshot_resolution_factor = - static_cast(ReadSetting(QStringLiteral("screenshot_resolution_factor"), 0).toUInt()); UISettings::values.select_user_on_boot = ReadSetting(QStringLiteral("select_user_on_boot"), false).toBool(); @@ -1079,8 +1075,6 @@ void Config::SaveRendererValues() { WriteSetting(QStringLiteral("backend"), static_cast(Settings::values.renderer_backend), 0); 
WriteSetting(QStringLiteral("debug"), Settings::values.renderer_debug, false); WriteSetting(QStringLiteral("vulkan_device"), Settings::values.vulkan_device, 0); - WriteSetting(QStringLiteral("resolution_factor"), - static_cast(Settings::values.resolution_factor), 1.0); WriteSetting(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0); WriteSetting(QStringLiteral("max_anisotropy"), Settings::values.max_anisotropy, 0); WriteSetting(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true); @@ -1156,8 +1150,6 @@ void Config::SaveUIValues() { QString::fromUtf8(UISettings::themes[0].second)); WriteSetting(QStringLiteral("enable_discord_presence"), UISettings::values.enable_discord_presence, true); - WriteSetting(QStringLiteral("screenshot_resolution_factor"), - UISettings::values.screenshot_resolution_factor, 0); WriteSetting(QStringLiteral("select_user_on_boot"), UISettings::values.select_user_on_boot, false); diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index ea667caefa..304625cd7d 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -19,47 +19,6 @@ #include "video_core/renderer_vulkan/renderer_vulkan.h" #endif -namespace { -enum class Resolution : int { - Auto, - Scale1x, - Scale2x, - Scale3x, - Scale4x, -}; - -float ToResolutionFactor(Resolution option) { - switch (option) { - case Resolution::Auto: - return 0.f; - case Resolution::Scale1x: - return 1.f; - case Resolution::Scale2x: - return 2.f; - case Resolution::Scale3x: - return 3.f; - case Resolution::Scale4x: - return 4.f; - } - return 0.f; -} - -Resolution FromResolutionFactor(float factor) { - if (factor == 0.f) { - return Resolution::Auto; - } else if (factor == 1.f) { - return Resolution::Scale1x; - } else if (factor == 2.f) { - return Resolution::Scale2x; - } else if (factor == 3.f) { - return Resolution::Scale3x; - } else if (factor == 4.f) { - return Resolution::Scale4x; - } - return Resolution::Auto; -} -} // Anonymous namespace - ConfigureGraphics::ConfigureGraphics(QWidget* parent) : QWidget(parent), ui(new Ui::ConfigureGraphics) { vulkan_device = Settings::values.vulkan_device; @@ -99,8 +58,6 @@ void ConfigureGraphics::SetConfiguration() { ui->api->setEnabled(runtime_lock); ui->api->setCurrentIndex(static_cast(Settings::values.renderer_backend)); - ui->resolution_factor_combobox->setCurrentIndex( - static_cast(FromResolutionFactor(Settings::values.resolution_factor))); ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio); ui->use_disk_shader_cache->setEnabled(runtime_lock); ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache); @@ -114,8 +71,6 @@ void ConfigureGraphics::SetConfiguration() { void ConfigureGraphics::ApplyConfiguration() { Settings::values.renderer_backend = GetCurrentGraphicsBackend(); Settings::values.vulkan_device = vulkan_device; - Settings::values.resolution_factor = - ToResolutionFactor(static_cast(ui->resolution_factor_combobox->currentIndex())); Settings::values.aspect_ratio = ui->aspect_ratio_combobox->currentIndex(); Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked(); Settings::values.use_asynchronous_gpu_emulation = diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index c816d6108a..6e75447a5b 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -84,46 +84,6 @@ - - 
- - - - Internal Resolution: - - - - - - - - Auto (Window Size) - - - - - Native (1280x720) - - - - - 2x Native (2560x1440) - - - - - 3x Native (3840x2160) - - - - - 4x Native (5120x2880) - - - - - - diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 270cccc772..4119d79077 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -689,10 +689,7 @@ void GMainWindow::InitializeHotkeys() { Settings::values.use_frame_limit = !Settings::values.use_frame_limit; UpdateStatusBar(); }); - // TODO: Remove this comment/static whenever the next major release of - // MSVC occurs and we make it a requirement (see: - // https://developercommunity.visualstudio.com/content/problem/93922/constexprs-are-trying-to-be-captured-in-lambda-fun.html) - static constexpr u16 SPEED_LIMIT_STEP = 5; + constexpr u16 SPEED_LIMIT_STEP = 5; connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Increase Speed Limit"), this), &QShortcut::activated, this, [&] { if (Settings::values.frame_limit < 9999 - SPEED_LIMIT_STEP) { diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index c20d48c429..5f9cc158eb 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -380,8 +380,6 @@ void Config::ReadValues() { Settings::values.renderer_debug = sdl2_config->GetBoolean("Renderer", "debug", false); Settings::values.vulkan_device = sdl2_config->GetInteger("Renderer", "vulkan_device", 0); - Settings::values.resolution_factor = - static_cast(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0)); Settings::values.aspect_ratio = static_cast(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0)); Settings::values.max_anisotropy = diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index abc6e6e656..1025020844 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -117,11 +117,6 @@ use_hw_renderer = # 0: Interpreter (slow), 1 (default): JIT (fast) use_shader_jit = -# Resolution scale factor -# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale -# factor for the Switch resolution -resolution_factor = - # Aspect ratio # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window aspect_ratio = diff --git a/src/yuzu_tester/config.cpp b/src/yuzu_tester/config.cpp index 3be58b15da..1566c2e3f7 100644 --- a/src/yuzu_tester/config.cpp +++ b/src/yuzu_tester/config.cpp @@ -116,8 +116,6 @@ void Config::ReadValues() { Settings::values.use_multi_core = sdl2_config->GetBoolean("Core", "use_multi_core", false); // Renderer - Settings::values.resolution_factor = - static_cast(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0)); Settings::values.aspect_ratio = static_cast(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0)); Settings::values.max_anisotropy = diff --git a/src/yuzu_tester/default_ini.h b/src/yuzu_tester/default_ini.h index ca203b64db..41bbbbf605 100644 --- a/src/yuzu_tester/default_ini.h +++ b/src/yuzu_tester/default_ini.h @@ -21,11 +21,6 @@ use_hw_renderer = # 0: Interpreter (slow), 1 (default): JIT (fast) use_shader_jit = -# Resolution scale factor -# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale -# factor for the Switch resolution -resolution_factor = - # Aspect ratio # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window aspect_ratio = From dc27252352a27f466c65e9008b3d56a2540ce224 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 22 May 2020 20:53:27 -0300 Subject: [PATCH 045/145] shader_cache: Implement a generic shader 
cache Implement a generic shader cache for fast lookups and invalidations. Invalidations are cheap but expensive when a shader is invalidated. Use two mutexes instead of one to avoid locking invalidations for lookups and vice versa. When a shader has to be removed, lookups are locked as expected. --- src/video_core/CMakeLists.txt | 1 + src/video_core/shader_cache.h | 228 ++++++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 src/video_core/shader_cache.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 2bf8d68ce0..cf1885f92b 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -93,6 +93,7 @@ add_library(video_core STATIC renderer_opengl/utils.h sampler_cache.cpp sampler_cache.h + shader_cache.h shader/decode/arithmetic.cpp shader/decode/arithmetic_immediate.cpp shader/decode/bfe.cpp diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h new file mode 100644 index 0000000000..a23c238868 --- /dev/null +++ b/src/video_core/shader_cache.h @@ -0,0 +1,228 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template +class ShaderCache { + static constexpr u64 PAGE_SHIFT = 14; + + struct Entry { + VAddr addr_start; + VAddr addr_end; + T* data; + + bool is_memory_marked = true; + + constexpr bool Overlaps(VAddr start, VAddr end) const noexcept { + return start < addr_end && addr_start < end; + } + }; + +public: + virtual ~ShaderCache() = default; + + /// @brief Removes shaders inside a given region + /// @note Checks for ranges + /// @param addr Start address of the invalidation + /// @param size Number of bytes of the invalidation + void InvalidateRegion(VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + RemovePendingShaders(); + } + + /// @brief Unmarks a memory region as cached and marks it for removal + /// @param addr Start address of the CPU write operation + /// @param size Number of bytes of the CPU write operation + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + } + + /// @brief Flushes delayed removal operations + void SyncGuestHost() { + std::scoped_lock lock{invalidation_mutex}; + RemovePendingShaders(); + } + + /// @brief Tries to obtain a cached shader starting in a given address + /// @note Doesn't check for ranges, the given address has to be the start of the shader + /// @param addr Start address of the shader, this doesn't cache for region + /// @return Pointer to a valid shader, nullptr when nothing is found + T* TryGet(VAddr addr) const { + std::scoped_lock lock{lookup_mutex}; + + const auto it = lookup_cache.find(addr); + if (it == lookup_cache.end()) { + return nullptr; + } + return it->second->data; + } + +protected: + explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {} + + /// @brief Register in the cache a given entry + /// @param data Shader to store in the cache + /// @param addr Start address of the shader that will be registered + /// @param size Size in bytes of the shader + void Register(std::unique_ptr data, VAddr addr, std::size_t size) { + std::scoped_lock 
lock{invalidation_mutex, lookup_mutex}; + + const VAddr addr_end = addr + size; + Entry* const entry = NewEntry(addr, addr_end, data.get()); + + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { + invalidation_cache[page].push_back(entry); + } + + storage.push_back(std::move(data)); + + rasterizer.UpdatePagesCachedCount(addr, size, 1); + } + + /// @brief Called when a shader is going to be removed + /// @param shader Shader that will be removed + /// @pre invalidation_cache is locked + /// @pre lookup_mutex is locked + virtual void OnShaderRemoval([[maybe_unused]] T* shader) {} + +private: + /// @brief Invalidate pages in a given region + /// @pre invalidation_mutex is locked + void InvalidatePagesInRegion(VAddr addr, std::size_t size) { + const VAddr addr_end = addr + size; + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { + const auto it = invalidation_cache.find(page); + if (it == invalidation_cache.end()) { + continue; + } + + std::vector& entries = it->second; + InvalidatePageEntries(entries, addr, addr_end); + + // If there's nothing else in this page, remove it to avoid overpopulating the hash map. + if (entries.empty()) { + invalidation_cache.erase(it); + } + } + } + + /// @brief Remove shaders marked for deletion + /// @pre invalidation_mutex is locked + void RemovePendingShaders() { + if (marked_for_removal.empty()) { + return; + } + std::scoped_lock lock{lookup_mutex}; + + std::vector removed_shaders; + removed_shaders.reserve(marked_for_removal.size()); + + for (Entry* const entry : marked_for_removal) { + if (lookup_cache.erase(entry->addr_start) > 0) { + removed_shaders.push_back(entry->data); + } + } + marked_for_removal.clear(); + + if (!removed_shaders.empty()) { + RemoveShadersFromStorage(std::move(removed_shaders)); + } + } + + /// @brief Invalidates entries in a given range for the passed page + /// @param entries Vector of entries in the page, it will be modified on overlaps + /// @param addr Start address of the invalidation + /// @param addr_end Non-inclusive end address of the invalidation + /// @pre invalidation_mutex is locked + void InvalidatePageEntries(std::vector& entries, VAddr addr, VAddr addr_end) { + auto it = entries.begin(); + while (it != entries.end()) { + Entry* const entry = *it; + if (!entry->Overlaps(addr, addr_end)) { + ++it; + continue; + } + UnmarkMemory(entry); + marked_for_removal.push_back(entry); + + it = entries.erase(it); + } + } + + /// @brief Unmarks an entry from the rasterizer cache + /// @param entry Entry to unmark from memory + void UnmarkMemory(Entry* entry) { + if (!entry->is_memory_marked) { + return; + } + entry->is_memory_marked = false; + + const VAddr addr = entry->addr_start; + const std::size_t size = entry->addr_end - addr; + rasterizer.UpdatePagesCachedCount(addr, size, -1); + } + + /// @brief Removes a vector of shaders from a list + /// @param removed_shaders Shaders to be removed from the storage, it can contain duplicates + /// @pre invalidation_mutex is locked + /// @pre lookup_mutex is locked + void RemoveShadersFromStorage(std::vector removed_shaders) { + // Remove duplicates + std::sort(removed_shaders.begin(), removed_shaders.end()); + removed_shaders.erase(std::unique(removed_shaders.begin(), removed_shaders.end()), + removed_shaders.end()); + + // Now that there are no duplicates, we can notify removals + for (T* const shader : removed_shaders) { + OnShaderRemoval(shader); + } + + // 
Remove them from the cache + const auto is_removed = [&removed_shaders](std::unique_ptr& shader) { + return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) != + removed_shaders.end(); + }; + storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end()); + } + + /// @brief Creates a new entry in the lookup cache and returns its pointer + /// @pre lookup_mutex is locked + Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) { + auto entry = std::make_unique(Entry{addr, addr_end, data}); + Entry* const entry_pointer = entry.get(); + + lookup_cache.emplace(addr, std::move(entry)); + return entry_pointer; + } + + VideoCore::RasterizerInterface& rasterizer; + + mutable std::mutex lookup_mutex; + std::mutex invalidation_mutex; + + std::unordered_map> lookup_cache; + std::unordered_map> invalidation_cache; + std::vector> storage; + std::vector marked_for_removal; +}; + +} // namespace VideoCommon From b96f65b62b143544716f66a65e93d2dcb141ee2b Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 22 May 2020 20:55:38 -0300 Subject: [PATCH 046/145] gl_shader_cache: Use generic shader cache Trivially port the generic shader cache to OpenGL. --- .../renderer_opengl/gl_rasterizer.cpp | 19 ++-- .../renderer_opengl/gl_rasterizer.h | 16 ++-- .../renderer_opengl/gl_shader_cache.cpp | 87 +++++++++---------- .../renderer_opengl/gl_shader_cache.h | 51 +++++------ 4 files changed, 80 insertions(+), 93 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 55e79aaf65..909ca9e0e4 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,6 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -282,7 +283,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { continue; } - Shader shader{shader_cache.GetStageProgram(program)}; + Shader* const shader = shader_cache.GetStageProgram(program); if (device.UseAssemblyShaders()) { // Check for ARB limitation. We only have 16 SSBOs per context state. 
To workaround this @@ -842,7 +843,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { static constexpr std::array PARAMETER_LUT = { GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, @@ -872,7 +873,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad } } -void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); const auto& launch_desc = system.GPU().KeplerCompute().launch_description; const auto& entries = kernel->GetEntries(); @@ -941,7 +942,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, } } -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; @@ -956,7 +957,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad } } -void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; @@ -979,7 +980,7 @@ void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& e static_cast(size)); } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; @@ -992,7 +993,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& } } -void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; @@ -1021,7 +1022,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu } } -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).image; for (const auto& entry : shader->GetEntries().images) { @@ -1031,7 +1032,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh } } -void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { +void RasterizerOpenGL::SetupComputeImages(Shader* shader) { const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : shader->GetEntries().images) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index f5dc56a0e2..1f8d45eac6 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ 
b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -100,10 +100,10 @@ private: void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); + void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(const Shader& kernel); + void SetupComputeConstBuffers(Shader* kernel); /// Configures a constant buffer. void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, @@ -111,30 +111,30 @@ private: std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); + void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(const Shader& kernel); + void SetupComputeGlobalMemory(Shader* kernel); /// Configures a constant buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, const Shader& shader); + void SetupDrawTextures(std::size_t stage_index, Shader* shader); /// Configures the textures used in a compute shader. - void SetupComputeTextures(const Shader& kernel); + void SetupComputeTextures(Shader* kernel); /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, const Shader& shader); + void SetupDrawImages(std::size_t stage_index, Shader* shader); /// Configures images in a compute shader. - void SetupComputeImages(const Shader& shader); + void SetupComputeImages(Shader* shader); /// Configures an image. void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index a991ca64a7..7c6797e02f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -29,6 +29,7 @@ #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -194,12 +195,9 @@ std::unordered_set GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr registry, - ShaderEntries entries, ProgramSharedPtr program_) - : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, - size_in_bytes{size_in_bytes}, program{std::move(program_)} { - // Assign either the assembly program or source program. We can't have both. 
+Shader::Shader(std::shared_ptr registry, ShaderEntries entries, + ProgramSharedPtr program) + : registry{std::move(registry)}, entries{std::move(entries)}, program{std::move(program)} { handle = program->assembly_program.handle; if (handle == 0) { handle = program->source_program.handle; @@ -207,16 +205,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, ASSERT(handle != 0); } -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -GLuint CachedShader::GetHandle() const { +GLuint Shader::GetHandle() const { DEBUG_ASSERT(registry->IsConsistent()); return handle; } -Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, ProgramCode code, - ProgramCode code_b) { +std::unique_ptr Shader::CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode code, ProgramCode code_b) { const auto shader_type = GetShaderType(program_type); const std::size_t size_in_bytes = code.size() * sizeof(u64); @@ -241,12 +239,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr( - new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), - MakeEntries(params.device, ir, shader_type), std::move(program))); + return std::unique_ptr(new Shader( + std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program))); } -Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { +std::unique_ptr Shader::CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code) { const std::size_t size_in_bytes = code.size() * sizeof(u64); auto& engine = params.system.GPU().KeplerCompute(); @@ -266,23 +264,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr( - new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), - MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); + return std::unique_ptr(new Shader(std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), + std::move(program))); } -Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes) { - return std::shared_ptr( - new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry, - precompiled_shader.entries, precompiled_shader.program)); +std::unique_ptr Shader::CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader) { + return std::unique_ptr(new Shader( + precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, Core::Frontend::EmuWindow& emu_window, const Device& device) - : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device}, - disk_cache{system} {} + : VideoCommon::ShaderCache{rasterizer}, system{system}, + emu_window{emu_window}, device{device}, disk_cache{system} {} + +ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { @@ -436,7 +434,7 @@ ProgramSharedPtr 
ShaderCacheOpenGL::GeneratePrecompiledProgram( return program; } -Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { return last_shaders[static_cast(program)]; } @@ -446,8 +444,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { // Look up shader in the cache based on address const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; - Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader}; - if (shader) { + if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast(program)] = shader; } @@ -468,30 +465,29 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { const ShaderParameters params{system, disk_cache, device, *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr shader; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), - std::move(code_b)); + shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + shader = Shader::CreateFromCache(params, found->second); } + Shader* const result = shader.get(); if (cpu_addr) { - Register(shader); + Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64)); } else { - null_shader = shader; + null_shader = std::move(shader); } - return last_shaders[static_cast(program)] = shader; + return last_shaders[static_cast(program)] = result; } -Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { +Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { auto& memory_manager{system.GPU().MemoryManager()}; const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; - auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel; - if (kernel) { + if (Shader* const kernel = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } @@ -503,20 +499,21 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { const ShaderParameters params{system, disk_cache, device, *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr kernel; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + kernel = Shader::CreateKernelFromMemory(params, std::move(code)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + kernel = Shader::CreateFromCache(params, found->second); } + Shader* const result = kernel.get(); if (cpu_addr) { - Register(kernel); + Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64)); } else { - null_kernel = kernel; + null_kernel = std::move(kernel); } - return kernel; + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index b2ae8d7f97..6848f13884 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -18,12 +18,12 @@ #include "common/common_types.h" #include "video_core/engines/shader_type.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -35,12 +35,10 @@ class EmuWindow; namespace OpenGL { -class CachedShader; class Device; class RasterizerOpenGL; struct UnspecializedShader; -using Shader = std::shared_ptr; using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct ProgramHandle { @@ -64,62 +62,53 @@ struct ShaderParameters { u64 unique_identifier; }; -class CachedShader final : public RasterizerCacheObject { +class Shader final { public: - ~CachedShader(); + ~Shader(); /// Gets the GL program handle for the shader GLuint GetHandle() const; - /// Returns the size in bytes of the shader - std::size_t GetSizeInBytes() const override { - return size_in_bytes; - } - /// Gets the shader entries for the shader const ShaderEntries& GetEntries() const { return entries; } - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + static std::unique_ptr CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode program_code, + ProgramCode program_code_b); + static std::unique_ptr CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code); - static Shader CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes); + static std::unique_ptr CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader); private: - explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr registry, - ShaderEntries entries, ProgramSharedPtr program); + explicit Shader(std::shared_ptr registry, ShaderEntries entries, + ProgramSharedPtr program); std::shared_ptr registry; ShaderEntries 
entries; - std::size_t size_in_bytes = 0; ProgramSharedPtr program; GLuint handle = 0; }; -class ShaderCacheOpenGL final : public RasterizerCache { +class ShaderCacheOpenGL final : public VideoCommon::ShaderCache { public: explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, Core::Frontend::EmuWindow& emu_window, const Device& device); + ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game void LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program - Shader GetStageProgram(Maxwell::ShaderProgram program); + Shader* GetStageProgram(Maxwell::ShaderProgram program); /// Gets a compute kernel in the passed address - Shader GetComputeKernel(GPUVAddr code_addr); - -protected: - // We do not have to flush this cache as things in it are never modified by us. - void FlushObjectInner(const Shader& object) override {} + Shader* GetComputeKernel(GPUVAddr code_addr); private: ProgramSharedPtr GeneratePrecompiledProgram( @@ -132,10 +121,10 @@ private: ShaderDiskCacheOpenGL disk_cache; std::unordered_map runtime_cache; - Shader null_shader{}; - Shader null_kernel{}; + std::unique_ptr null_shader; + std::unique_ptr null_kernel; - std::array last_shaders; + std::array last_shaders{}; }; } // namespace OpenGL From 678f95e4f824740899c31a94c6aa6adaa634e699 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 22 May 2020 21:01:36 -0300 Subject: [PATCH 047/145] vk_pipeline_cache: Use generic shader cache Trivial port the generic shader cache to Vulkan. --- .../renderer_opengl/gl_shader_cache.cpp | 6 +- .../renderer_vulkan/vk_pipeline_cache.cpp | 65 ++++++++++--------- .../renderer_vulkan/vk_pipeline_cache.h | 33 ++++------ .../renderer_vulkan/vk_rasterizer.cpp | 7 +- .../renderer_vulkan/vk_rasterizer.h | 2 +- 5 files changed, 55 insertions(+), 58 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 7c6797e02f..c28486b1db 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -195,9 +195,9 @@ std::unordered_set GetSupportedFormats() { } // Anonymous namespace -Shader::Shader(std::shared_ptr registry, ShaderEntries entries, - ProgramSharedPtr program) - : registry{std::move(registry)}, entries{std::move(entries)}, program{std::move(program)} { +Shader::Shader(std::shared_ptr registry_, ShaderEntries entries_, + ProgramSharedPtr program_) + : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} { handle = program->assembly_program.handle; if (handle == 0) { handle = program->source_program.handle; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 65a1c62454..20cbeb671e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -27,6 +27,7 @@ #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/compiler_settings.h" #include "video_core/shader/memory_util.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -130,19 +131,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con return std::memcmp(&rhs, this, sizeof *this) == 0; } -CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, - GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code, - 
u32 main_offset) - : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)}, +Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset) + : gpu_addr{gpu_addr}, program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset, compiler_settings, registry}, entries{GenerateShaderEntries(shader_ir)} {} -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( - Core::System& system, Tegra::Engines::ShaderType stage) { - if (stage == Tegra::Engines::ShaderType::Compute) { +Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system, + Tegra::Engines::ShaderType stage) { + if (stage == ShaderType::Compute) { return system.GPU().KeplerCompute(); } else { return system.GPU().Maxwell3D(); @@ -154,16 +154,16 @@ VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasteri VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache) - : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, - descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache{renderpass_cache} {} + : VideoCommon::ShaderCache{rasterizer}, system{system}, device{device}, + scheduler{scheduler}, descriptor_pool{descriptor_pool}, + update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {} VKPipelineCache::~VKPipelineCache() = default; -std::array VKPipelineCache::GetShaders() { +std::array VKPipelineCache::GetShaders() { const auto& gpu = system.GPU().Maxwell3D(); - std::array shaders; + std::array shaders{}; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto program{static_cast(index)}; @@ -176,24 +176,28 @@ std::array VKPipelineCache::GetShaders() { const GPUVAddr program_addr{GetShaderAddress(system, program)}; const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; - if (!shader) { + + Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); + if (!result) { const auto host_ptr{memory_manager.GetPointer(program_addr)}; // No shader found - create a new one constexpr u32 stage_offset = STAGE_MAIN_OFFSET; - const auto stage = static_cast(index == 0 ? 0 : index - 1); + const auto stage = static_cast(index == 0 ? 0 : index - 1); ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader = std::make_unique(system, stage, program_addr, std::move(code), + stage_offset); + result = shader.get(); - shader = std::make_shared(system, stage, program_addr, *cpu_addr, - std::move(code), stage_offset); if (cpu_addr) { - Register(shader); + Register(std::move(shader), *cpu_addr, size_in_bytes); } else { - null_shader = shader; + null_shader = std::move(shader); } } - shaders[index] = std::move(shader); + shaders[index] = result; } return last_shaders = shaders; } @@ -234,19 +238,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel; + Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get(); if (!shader) { // No shader found - create a new one const auto host_ptr = memory_manager.GetPointer(program_addr); ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true); - shader = std::make_shared(system, Tegra::Engines::ShaderType::Compute, - program_addr, *cpu_addr, std::move(code), - KERNEL_MAIN_OFFSET); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader_info = std::make_unique(system, ShaderType::Compute, program_addr, + std::move(code), KERNEL_MAIN_OFFSET); + shader = shader_info.get(); + if (cpu_addr) { - Register(shader); + Register(std::move(shader_info), *cpu_addr, size_in_bytes); } else { - null_kernel = shader; + null_kernel = std::move(shader_info); } } @@ -262,7 +269,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach return *entry; } -void VKPipelineCache::Unregister(const Shader& shader) { +void VKPipelineCache::OnShaderRemoval(Shader* shader) { bool finished = false; const auto Finish = [&] { // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and @@ -294,8 +301,6 @@ void VKPipelineCache::Unregister(const Shader& shader) { Finish(); it = compute_cache.erase(it); } - - RasterizerCache::Unregister(shader); } std::pair> diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 0b5796fefb..0a36e51129 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -17,7 +17,6 @@ #include "common/common_types.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" @@ -26,6 +25,7 @@ #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -41,8 +41,6 @@ class VKFence; class VKScheduler; class VKUpdateDescriptorQueue; -class CachedShader; -using Shader = std::shared_ptr; using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct GraphicsPipelineCacheKey { @@ -102,21 +100,16 @@ struct hash { namespace Vulkan { -class CachedShader final : public RasterizerCacheObject { +class Shader { public: - explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code, - u32 main_offset); - ~CachedShader(); + explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset); + ~Shader(); GPUVAddr GetGpuAddr() const { return gpu_addr; } - std::size_t GetSizeInBytes() const override { - return program_code.size() * sizeof(u64); - } - VideoCommon::Shader::ShaderIR& GetIR() { return shader_ir; } @@ -144,25 +137,23 @@ private: ShaderEntries entries; }; -class VKPipelineCache final : public RasterizerCache { +class VKPipelineCache final : public VideoCommon::ShaderCache { public: explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& 
update_descriptor_queue, VKRenderPassCache& renderpass_cache); - ~VKPipelineCache(); + ~VKPipelineCache() override; - std::array GetShaders(); + std::array GetShaders(); VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key); VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key); protected: - void Unregister(const Shader& shader) override; - - void FlushObjectInner(const Shader& object) override {} + void OnShaderRemoval(Shader* shader) final; private: std::pair> DecompileShaders( @@ -175,10 +166,10 @@ private: VKUpdateDescriptorQueue& update_descriptor_queue; VKRenderPassCache& renderpass_cache; - Shader null_shader{}; - Shader null_kernel{}; + std::unique_ptr null_shader; + std::unique_ptr null_kernel; - std::array last_shaders; + std::array last_shaders{}; GraphicsPipelineCacheKey last_graphics_key; VKGraphicsPipeline* last_graphics_pipeline = nullptr; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 86328237ea..ffea9ee36b 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -38,6 +38,7 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) { } std::array GetShaderAddresses( - const std::array& shaders) { + const std::array& shaders) { std::array addresses; for (std::size_t i = 0; i < std::size(addresses); ++i) { addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; @@ -775,12 +776,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt } void RasterizerVulkan::SetupShaderDescriptors( - const std::array& shaders) { + const std::array& shaders) { texture_cache.GuardSamplers(true); for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { // Skip VertexA stage - const auto& shader = shaders[stage + 1]; + Shader* const shader = shaders[stage + 1]; if (!shader) { continue; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 0ed0e48c67..ef77c36222 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -168,7 +168,7 @@ private: bool is_indexed, bool is_instanced); /// Setup descriptors in the graphics pipeline. - void SetupShaderDescriptors(const std::array& shaders); + void SetupShaderDescriptors(const std::array& shaders); void SetupImageTransitions(Texceptions texceptions, const std::array& color_attachments, From abcea1bb188cb4db0ee7e27bea26d6458c881c2d Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 22 May 2020 21:03:57 -0300 Subject: [PATCH 048/145] rasterizer_cache: Remove files and includes The rasterizer cache is no longer used. Each cache has its own generic implementation optimized for the cached data. 
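For reference, the pattern every backend cache now follows is compact enough to sketch here. This is an illustrative sketch only, not code from this series: the names BackendShader and BackendShaderCache are hypothetical, and it assumes the VideoCommon::ShaderCache<T> interface added earlier in this series (protected constructor taking a RasterizerInterface, Register(), TryGet(), and the OnShaderRemoval() hook).

    // Illustrative sketch; BackendShader/BackendShaderCache are hypothetical names,
    // not symbols introduced by these patches.
    #include <cstddef>
    #include <memory>

    #include "common/common_types.h"
    #include "video_core/rasterizer_interface.h"
    #include "video_core/shader_cache.h"

    namespace Example {

    class BackendShader {
        // Backend-specific compiled data (GL program, SPIR-V module, ...) would live here.
    };

    class BackendShaderCache final : public VideoCommon::ShaderCache<BackendShader> {
    public:
        explicit BackendShaderCache(VideoCore::RasterizerInterface& rasterizer)
            : VideoCommon::ShaderCache<BackendShader>{rasterizer} {}

        BackendShader* GetShader(VAddr cpu_addr, std::size_t size_in_bytes) {
            // Fast path: only lookup_mutex is taken, so invalidations stay cheap.
            if (BackendShader* const shader = TryGet(cpu_addr)) {
                return shader;
            }
            // Slow path: build the shader and hand ownership to the generic cache,
            // which tracks [cpu_addr, cpu_addr + size_in_bytes) for invalidation.
            auto shader = std::make_unique<BackendShader>();
            BackendShader* const result = shader.get();
            Register(std::move(shader), cpu_addr, size_in_bytes);
            return result;
        }

    protected:
        // Called with both mutexes held when an invalidated shader is finally removed;
        // backend resources would be released here.
        void OnShaderRemoval([[maybe_unused]] BackendShader* shader) override {}
    };

    } // namespace Example

Shaders created without a valid CPU address (null_shader/null_kernel in the real caches) are kept outside the cache and never registered, exactly as the OpenGL and Vulkan patches above do.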
--- src/video_core/CMakeLists.txt | 2 - src/video_core/rasterizer_cache.cpp | 7 - src/video_core/rasterizer_cache.h | 253 ------------------ .../renderer_opengl/gl_buffer_cache.h | 1 - .../renderer_opengl/gl_rasterizer.h | 1 - .../renderer_vulkan/vk_buffer_cache.h | 1 - .../renderer_vulkan/vk_pipeline_cache.cpp | 7 +- 7 files changed, 3 insertions(+), 269 deletions(-) delete mode 100644 src/video_core/rasterizer_cache.cpp delete mode 100644 src/video_core/rasterizer_cache.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index cf1885f92b..39d5d84014 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -49,8 +49,6 @@ add_library(video_core STATIC query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h - rasterizer_cache.cpp - rasterizer_cache.h rasterizer_interface.h renderer_base.cpp renderer_base.h diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp deleted file mode 100644 index 093b2cdf4e..0000000000 --- a/src/video_core/rasterizer_cache.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/rasterizer_cache.h" - -RasterizerCacheObject::~RasterizerCacheObject() = default; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h deleted file mode 100644 index 096ee337cc..0000000000 --- a/src/video_core/rasterizer_cache.h +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include -#include -#include - -#include -#include - -#include "common/common_types.h" -#include "core/settings.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_interface.h" - -class RasterizerCacheObject { -public: - explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {} - - virtual ~RasterizerCacheObject(); - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - /// Gets the size of the shader in guest memory, required for cache management - virtual std::size_t GetSizeInBytes() const = 0; - - /// Sets whether the cached object should be considered registered - void SetIsRegistered(bool registered) { - is_registered = registered; - } - - /// Returns true if the cached object is registered - bool IsRegistered() const { - return is_registered; - } - - /// Returns true if the cached object is dirty - bool IsDirty() const { - return is_dirty; - } - - /// Returns ticks from when this cached object was last modified - u64 GetLastModifiedTicks() const { - return last_modified_ticks; - } - - /// Marks an object as recently modified, used to specify whether it is clean or dirty - template - void MarkAsModified(bool dirty, T& cache) { - is_dirty = dirty; - last_modified_ticks = cache.GetModifiedTicks(); - } - - void SetMemoryMarked(bool is_memory_marked_) { - is_memory_marked = is_memory_marked_; - } - - bool IsMemoryMarked() const { - return is_memory_marked; - } - - void SetSyncPending(bool is_sync_pending_) { - is_sync_pending = is_sync_pending_; - } - - bool IsSyncPending() const { - return is_sync_pending; - } - -private: - bool is_registered{}; ///< Whether the object is currently registered with the cache - bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) - bool is_memory_marked{}; ///< Whether the object is marking rasterizer memory. 
- bool is_sync_pending{}; ///< Whether the object is pending deletion. - u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing - VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space -}; - -template -class RasterizerCache : NonCopyable { - friend class RasterizerCacheObject; - -public: - explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} - - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - FlushObject(object); - } - } - - /// Mark the specified region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - if (!object->IsRegistered()) { - // Skip duplicates - continue; - } - Unregister(object); - } - } - - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - for (const auto& object : GetSortedObjectsFromRegion(addr, size)) { - if (object->IsRegistered()) { - UnmarkMemory(object); - object->SetSyncPending(true); - marked_for_unregister.emplace_back(object); - } - } - } - - void SyncGuestHost() { - std::lock_guard lock{mutex}; - - for (const auto& object : marked_for_unregister) { - if (object->IsRegistered()) { - object->SetSyncPending(false); - Unregister(object); - } - } - marked_for_unregister.clear(); - } - - /// Invalidates everything in the cache - void InvalidateAll() { - std::lock_guard lock{mutex}; - - while (interval_cache.begin() != interval_cache.end()) { - Unregister(*interval_cache.begin()->second.begin()); - } - } - -protected: - /// Tries to get an object from the cache with the specified cache address - T TryGet(VAddr addr) const { - const auto iter = map_cache.find(addr); - if (iter != map_cache.end()) - return iter->second; - return nullptr; - } - - /// Register an object into the cache - virtual void Register(const T& object) { - std::lock_guard lock{mutex}; - - object->SetIsRegistered(true); - interval_cache.add({GetInterval(object), ObjectSet{object}}); - map_cache.insert({object->GetCpuAddr(), object}); - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1); - object->SetMemoryMarked(true); - } - - /// Unregisters an object from the cache - virtual void Unregister(const T& object) { - std::lock_guard lock{mutex}; - - UnmarkMemory(object); - object->SetIsRegistered(false); - if (object->IsSyncPending()) { - marked_for_unregister.remove(object); - object->SetSyncPending(false); - } - const VAddr addr = object->GetCpuAddr(); - interval_cache.subtract({GetInterval(object), ObjectSet{object}}); - map_cache.erase(addr); - } - - void UnmarkMemory(const T& object) { - if (!object->IsMemoryMarked()) { - return; - } - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); - object->SetMemoryMarked(false); - } - - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - std::lock_guard lock{mutex}; - - return ++modified_ticks; - } - - virtual void FlushObjectInner(const T& object) = 0; - - /// Flushes the specified object, updating appropriate cache state as needed - void FlushObject(const T& object) { - std::lock_guard lock{mutex}; - - if (!object->IsDirty()) { - 
return; - } - FlushObjectInner(object); - object->MarkAsModified(false, *this); - } - - std::recursive_mutex mutex; - -private: - /// Returns a list of cached objects from the specified memory region, ordered by access time - std::vector GetSortedObjectsFromRegion(VAddr addr, u64 size) { - if (size == 0) { - return {}; - } - - std::vector objects; - const ObjectInterval interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) { - for (auto& cached_object : pair.second) { - if (!cached_object) { - continue; - } - objects.push_back(cached_object); - } - } - - std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool { - return a->GetLastModifiedTicks() < b->GetLastModifiedTicks(); - }); - - return objects; - } - - using ObjectSet = std::set; - using ObjectCache = std::unordered_map; - using IntervalCache = boost::icl::interval_map; - using ObjectInterval = typename IntervalCache::interval_type; - - static auto GetInterval(const T& object) { - return ObjectInterval::right_open(object->GetCpuAddr(), - object->GetCpuAddr() + object->GetSizeInBytes()); - } - - ObjectCache map_cache; - IntervalCache interval_cache; ///< Cache of objects - u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing - VideoCore::RasterizerInterface& rasterizer; - std::list marked_for_unregister; -}; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a9e86cfc7c..679b9b1d73 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 1f8d45eac6..208a4485b1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,7 +19,6 @@ #include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_accelerated.h" -#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index a54583e7d0..65cb3c8ad0 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -8,7 +8,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 20cbeb671e..150d86b621 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -335,12 +335,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { } const GPUVAddr gpu_addr = 
GetShaderAddress(system, program_enum); - const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); - const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; - ASSERT(shader); + const std::optional cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 - const auto program_type = GetShaderType(program_enum); + const ShaderType program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); program[stage] = { Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), From c95c254f3eda75476ad221a4828033f4140a3470 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 29 May 2020 23:32:41 -0300 Subject: [PATCH 049/145] texture_cache: Implement rendering to 3D textures This allows rendering to 3D textures with more than one slice. Applications are allowed to render to more than one slice of a texture using gl_Layer from a VTG shader. This also requires reworking how 3D texture collisions are handled, for now, this commit allows rendering to slices but not to miplevels. When a render target attempts to write to a mipmap, we fallback to the previous implementation (copying or flushing as needed). - Fixes color correction 3D textures on UE4 games (rainbow effects). - Allows Xenoblade games to render to 3D textures directly. --- src/video_core/engines/maxwell_3d.h | 1 + .../renderer_opengl/gl_texture_cache.cpp | 51 ++++--- .../renderer_opengl/gl_texture_cache.h | 6 +- .../renderer_vulkan/vk_rasterizer.cpp | 9 +- .../renderer_vulkan/vk_texture_cache.cpp | 76 +++++++++-- .../renderer_vulkan/vk_texture_cache.h | 33 ++--- src/video_core/texture_cache/surface_base.cpp | 7 +- src/video_core/texture_cache/surface_base.h | 13 +- .../texture_cache/surface_params.cpp | 17 ++- src/video_core/texture_cache/texture_cache.h | 127 ++++++++---------- 10 files changed, 196 insertions(+), 144 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index b827b112fc..79fc9bbea6 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -598,6 +598,7 @@ public: BitField<4, 3, u32> block_height; BitField<8, 3, u32> block_depth; BitField<12, 1, InvMemoryLayout> type; + BitField<16, 1, u32> is_3d; } memory_layout; union { BitField<0, 16, u32> layers; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 57db5a08b8..46df15dfce 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param target = GetTextureTarget(params.target); texture = CreateTexture(params, target, internal_format, texture_buffer); DecorateSurfaceName(); - main_view = CreateViewInner( - ViewParams(params.target, 0, params.is_layered ? 
params.depth : 1, 0, params.num_levels), - true); + + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + + main_view = + CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); } CachedSurface::~CachedSurface() = default; @@ -413,37 +418,40 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p CachedSurfaceView::~CachedSurfaceView() = default; -void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { +void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { ASSERT(params.num_levels == 1); - if (params.num_layers > 1) { - // Layered framebuffer attachments - UNIMPLEMENTED_IF(params.base_layer != 0); - - switch (params.target) { - case SurfaceTarget::Texture2DArray: - glFramebufferTexture(target, attachment, GetTexture(), 0); - break; - default: - UNIMPLEMENTED(); + if (params.target == SurfaceTarget::Texture3D) { + if (params.num_layers > 1) { + ASSERT(params.base_layer == 0); + glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); + } else { + glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, + params.base_level, params.base_layer); } return; } + if (params.num_layers > 1) { + UNIMPLEMENTED_IF(params.base_layer != 0); + glFramebufferTexture(fb_target, attachment, GetTexture(), 0); + return; + } + const GLenum view_target = surface.GetTarget(); const GLuint texture = surface.GetTexture(); switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, texture, params.base_level, + glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, params.base_layer); break; default: @@ -500,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const { OGLTextureView texture_view; texture_view.Create(); - glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, - params.num_levels, params.base_layer, params.num_layers); + if (target == GL_TEXTURE_3D) { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, 0, 1); + } else { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, params.base_layer, params.num_layers); + } ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8a2ac86031..bfc4ddf5d1 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -80,8 +80,10 @@ public: explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); ~CachedSurfaceView(); - /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER - void 
Attach(GLenum attachment, GLenum target) const; + /// @brief Attaches this texture view to the currently bound fb_target framebuffer + /// @param attachment Attachment to bind textures to + /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER) + void Attach(GLenum attachment, GLenum fb_target) const; GLuint GetTexture(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d86c46412b..19b8f9da37 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -716,7 +716,7 @@ std::tuple RasterizerVulkan::ConfigureFramebuffers( if (!view) { return false; } - key.views.push_back(view->GetHandle()); + key.views.push_back(view->GetAttachment()); key.width = std::min(key.width, view->GetWidth()); key.height = std::min(key.height, view->GetHeight()); key.layers = std::min(key.layers, view->GetNumLayers()); @@ -1137,8 +1137,8 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu auto view = texture_cache.GetTextureSurface(texture.tic, entry); ASSERT(!view->IsBufferView()); - const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); + const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); const auto sampler = sampler_cache.GetSampler(texture.tsc); update_descriptor_queue.AddSampledImage(sampler, image_view); @@ -1164,7 +1164,8 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima UNIMPLEMENTED_IF(tic.IsBuffer()); - const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + const VkImageView image_view = + view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source); update_descriptor_queue.AddImage(image_view); const auto image_layout = update_descriptor_queue.GetLastImageLayout(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ea487b7708..430031665c 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP ci.extent = {params.width, params.height, 1}; break; case SurfaceTarget::Texture3D: + ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; ci.extent = {params.width, params.height, params.depth}; break; case SurfaceTarget::TextureBuffer: @@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP return ci; } +u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { + return (static_cast(x_source) << 24) | (static_cast(y_source) << 16) | + (static_cast(z_source) << 8) | static_cast(w_source); +} + } // Anonymous namespace CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, @@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, } // TODO(Rodrigo): Move this to a virtual function. 
- main_view = CreateViewInner( - ViewParams(params.target, 0, static_cast(params.GetNumLayers()), 0, params.num_levels), - true); + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels)); } CachedSurface::~CachedSurface() = default; @@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() { } View CachedSurface::CreateView(const ViewParams& params) { - return CreateViewInner(params, false); -} - -View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { // TODO(Rodrigo): Add name decorations - return views[params] = std::make_shared(device, *this, params, is_proxy); + return views[params] = std::make_shared(device, *this, params); } void CachedSurface::UploadBuffer(const std::vector& staging_buffer) { @@ -342,18 +347,27 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { } CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy) + const ViewParams& params) : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, - base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, - num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) - : VK_IMAGE_VIEW_TYPE_1D} {} + base_level{params.base_level}, num_levels{params.num_levels}, + image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + base_layer = 0; + num_layers = 1; + base_slice = params.base_layer; + num_slices = params.num_layers; + } else { + base_layer = params.base_layer; + num_layers = params.num_layers; + } +} CachedSurfaceView::~CachedSurfaceView() = default; -VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { +VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); if (last_image_view && last_swizzle == new_swizzle) { return last_image_view; @@ -399,6 +413,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y }); } + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ASSERT(base_slice == 0); + ASSERT(num_slices == params.depth); + } + VkImageViewCreateInfo ci; ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; ci.pNext = nullptr; @@ -417,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y return last_image_view = *image_view; } +VkImageView CachedSurfaceView::GetAttachment() { + if (render_target) { + return *render_target; + } + + VkImageViewCreateInfo ci; + ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + ci.pNext = nullptr; + ci.flags = 0; + ci.image = surface.GetImageHandle(); + ci.format = surface.GetImage().GetFormat(); + ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, + VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; + ci.subresourceRange.aspectMask = aspect_mask; + ci.subresourceRange.baseMipLevel = base_level; + ci.subresourceRange.levelCount = num_levels; + if 
(image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; + ci.subresourceRange.baseArrayLayer = base_slice; + ci.subresourceRange.layerCount = num_slices; + } else { + ci.viewType = image_view_type; + ci.subresourceRange.baseArrayLayer = base_layer; + ci.subresourceRange.layerCount = num_layers; + } + render_target = device.GetLogical().CreateImageView(ci); + return *render_target; +} + VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, VKResourceManager& resource_manager, VKMemoryManager& memory_manager, VKScheduler& scheduler, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f211ccb1eb..807e26c8a0 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -91,7 +91,6 @@ protected: void DecorateSurfaceName(); View CreateView(const ViewParams& params) override; - View CreateViewInner(const ViewParams& params, bool is_proxy); private: void UploadBuffer(const std::vector& staging_buffer); @@ -120,23 +119,20 @@ private: class CachedSurfaceView final : public VideoCommon::ViewBase { public: explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy); + const ViewParams& params); ~CachedSurfaceView(); - VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source); + + VkImageView GetAttachment(); bool IsSameSurface(const CachedSurfaceView& rhs) const { return &surface == &rhs.surface; } - VkImageView GetHandle() { - return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G, - Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A); - } - u32 GetWidth() const { return params.GetMipWidth(base_level); } @@ -180,14 +176,6 @@ public: } private: - static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) { - return (static_cast(x_source) << 24) | (static_cast(y_source) << 16) | - (static_cast(z_source) << 8) | static_cast(w_source); - } - // Store a copy of these values to avoid double dereference when reading them const SurfaceParams params; const VkImage image; @@ -196,15 +184,18 @@ private: const VKDevice& device; CachedSurface& surface; - const u32 base_layer; - const u32 num_layers; const u32 base_level; const u32 num_levels; const VkImageViewType image_view_type; + u32 base_layer = 0; + u32 num_layers = 0; + u32 base_slice = 0; + u32 num_slices = 0; VkImageView last_image_view = nullptr; u32 last_swizzle = 0; + vk::ImageView render_target; std::unordered_map view_cache; }; diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 715f39d0dc..94d3a6ae53 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -248,12 +248,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, // Use an extra temporal buffer auto& tmp_buffer = 
staging_cache.GetBuffer(1); - // Special case for 3D Texture Segments - const bool must_read_current_data = - params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D; tmp_buffer.resize(guest_memory_size); host_ptr = tmp_buffer.data(); - if (must_read_current_data) { + + if (params.target == SurfaceTarget::Texture3D) { + // Special case for 3D texture segments memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); } diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h index 79e10ffbb1..173f2edba0 100644 --- a/src/video_core/texture_cache/surface_base.h +++ b/src/video_core/texture_cache/surface_base.h @@ -217,8 +217,8 @@ public: } bool IsProtected() const { - // Only 3D Slices are to be protected - return is_target && params.block_depth > 0; + // Only 3D slices are to be protected + return is_target && params.target == SurfaceTarget::Texture3D; } bool IsRenderTarget() const { @@ -250,6 +250,11 @@ public: return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels)); } + TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) { + return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth, + base_level, num_levels)); + } + std::optional EmplaceIrregularView(const SurfaceParams& view_params, const GPUVAddr view_addr, const std::size_t candidate_size, const u32 mipmap, @@ -272,8 +277,8 @@ public: std::optional EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr, const std::size_t candidate_size) { if (params.target == SurfaceTarget::Texture3D || - (params.num_levels == 1 && !params.is_layered) || - view_params.target == SurfaceTarget::Texture3D) { + view_params.target == SurfaceTarget::Texture3D || + (params.num_levels == 1 && !params.is_layered)) { return {}; } const auto layer_mipmap{GetLayerMipmap(view_addr)}; diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 884fabffe2..642eeb850f 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz params.num_levels = 1; params.emulated_levels = 1; - const bool is_layered = config.layers > 1 && params.block_depth == 0; - params.is_layered = is_layered; - params.depth = is_layered ? config.layers.Value() : 1; - params.target = is_layered ? 
SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; + if (config.memory_layout.is_3d != 0) { + params.depth = config.layers.Value(); + params.is_layered = false; + params.target = SurfaceTarget::Texture3D; + } else if (config.layers > 1) { + params.depth = config.layers.Value(); + params.is_layered = true; + params.target = SurfaceTarget::Texture2DArray; + } else { + params.depth = 1; + params.is_layered = false; + params.target = SurfaceTarget::Texture2D; + } return params; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 6f63217a2d..4ee0d76b94 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -508,12 +508,12 @@ private: return RecycleStrategy::Flush; } // 3D Textures decision - if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) { + if (params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } for (const auto& s : overlaps) { const auto& s_params = s->GetSurfaceParams(); - if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) { + if (s_params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } } @@ -726,76 +726,60 @@ private: * @param params The parameters on the new surface. * @param gpu_addr The starting address of the new surface. * @param cpu_addr The starting address of the new surface on physical memory. - * @param preserve_contents Indicates that the new surface should be loaded from memory or - * left blank. */ std::optional> Manage3DSurfaces(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, - const VAddr cpu_addr, - bool preserve_contents) { - if (params.target == SurfaceTarget::Texture3D) { - bool failed = false; - if (params.num_levels > 1) { - // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach - return std::nullopt; - } - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - bool modified = false; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.target != SurfaceTarget::Texture2D) { - failed = true; - break; - } - if (src_params.height != params.height) { - failed = true; - break; - } - if (src_params.block_depth != params.block_depth || - src_params.block_height != params.block_height) { - failed = true; - break; - } - const u32 offset = static_cast(surface->GetCpuAddr() - cpu_addr); - const auto offsets = params.GetBlockOffsetXYZ(offset); - const auto z = std::get<2>(offsets); - modified |= surface->IsModified(); - const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height, - 1); - ImageCopy(surface, new_surface, copy_params); - } - if (failed) { - return std::nullopt; - } - for (const auto& surface : overlaps) { - Unregister(surface); - } - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - auto view = new_surface->GetMainView(); - return {{std::move(new_surface), view}}; - } else { - for (const auto& surface : overlaps) { - if (!surface->MatchTarget(params.target)) { - if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { - if (Settings::IsGPULevelExtreme()) { - return std::nullopt; - } - Unregister(surface); - return InitializeSurface(gpu_addr, params, preserve_contents); - } - return std::nullopt; - } - if (surface->GetCpuAddr() != cpu_addr) { - continue; - } - if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { - return {{surface, 
surface->GetMainView()}}; - } - } - return InitializeSurface(gpu_addr, params, preserve_contents); + GPUVAddr gpu_addr, VAddr cpu_addr) { + if (params.num_levels > 1) { + // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach + return std::nullopt; } + + if (overlaps.size() == 1) { + const auto& surface = overlaps[0]; + const SurfaceParams& overlap_params = surface->GetSurfaceParams(); + // Don't attempt to render to textures with more than one level for now + // The texture has to be to the right or the sample address if we want to render to it + if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { + const u32 offset = static_cast(cpu_addr - surface->GetCpuAddr()); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + if (slice < overlap_params.depth) { + auto view = surface->Emplace3DView(slice, params.depth, 0, 1); + return std::make_pair(std::move(surface), std::move(view)); + } + } + } + + if (params.depth == 1) { + return std::nullopt; + } + + TSurface new_surface = GetUncachedSurface(gpu_addr, params); + bool modified = false; + for (auto& surface : overlaps) { + const SurfaceParams& src_params = surface->GetSurfaceParams(); + if (src_params.height != params.height || + src_params.block_depth != params.block_depth || + src_params.block_height != params.block_height) { + return std::nullopt; + } + modified |= surface->IsModified(); + + const u32 offset = static_cast(surface->GetCpuAddr() - cpu_addr); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + const u32 width = params.width; + const u32 height = params.height; + const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); + ImageCopy(surface, new_surface, copy_params); + } + for (const auto& surface : overlaps) { + Unregister(surface); + } + new_surface->MarkAsModified(modified, Tick()); + Register(new_surface); + + auto view = new_surface->GetMainView(); + return std::make_pair(std::move(new_surface), std::move(view)); } /** @@ -873,10 +857,9 @@ private: } } - // Check if it's a 3D texture - if (params.block_depth > 0) { - auto surface = - Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); + // Manage 3D textures + if (params.target == SurfaceTarget::Texture3D) { + auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr); if (surface) { return *surface; } From 3c2ae53b4c574deb4f9afe3104c7d022c53c5281 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 1 Jun 2020 04:49:35 -0300 Subject: [PATCH 050/145] texture_cache: Handle 3D texture blits with one layer --- src/video_core/renderer_opengl/gl_texture_cache.cpp | 4 ++-- src/video_core/texture_cache/surface_params.cpp | 4 ++-- src/video_core/texture_cache/texture_cache.h | 7 ++++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 46df15dfce..61505879b9 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -557,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Tegra::Engines::Fermi2D::Config& copy_config) { const auto& src_params{src_view->GetSurfaceParams()}; const auto& dst_params{dst_view->GetSurfaceParams()}; - UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); - UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(src_params.depth != 1); + 
UNIMPLEMENTED_IF(dst_params.depth != 1); state_tracker.NotifyScissor0(); state_tracker.NotifyFramebuffer(); diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 642eeb850f..6fe7c85aca 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -246,8 +246,8 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface( params.width = config.width; params.height = config.height; params.pitch = config.pitch; - // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters - params.target = SurfaceTarget::Texture2D; + // TODO(Rodrigo): Try to guess texture arrays from parameters + params.target = params.block_depth > 0 ? SurfaceTarget::Texture3D : SurfaceTarget::Texture2D; params.depth = 1; params.num_levels = 1; params.emulated_levels = 1; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4ee0d76b94..60b95a8549 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -755,6 +755,8 @@ private: } TSurface new_surface = GetUncachedSurface(gpu_addr, params); + LoadSurface(new_surface); + bool modified = false; for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); @@ -763,7 +765,10 @@ private: src_params.block_height != params.block_height) { return std::nullopt; } - modified |= surface->IsModified(); + if (!surface->IsModified()) { + continue; + } + modified = true; const u32 offset = static_cast(surface->GetCpuAddr() - cpu_addr); const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); From c99f5d405b6a24603ff8174aeb2952facc4a92d9 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 8 Jun 2020 05:01:44 -0300 Subject: [PATCH 051/145] texture_cache: Simplify blit code --- src/video_core/texture_cache/texture_cache.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 60b95a8549..b19eeed664 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -298,15 +298,13 @@ public: const GPUVAddr src_gpu_addr = src_config.Address(); const GPUVAddr dst_gpu_addr = dst_config.Address(); DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); - const std::optional dst_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr); - const std::optional src_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr); - std::pair dst_surface = - GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); - std::pair src_surface = - GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false); - ImageBlit(src_surface.second, dst_surface.second, copy_config); + + const auto& memory_manager = system.GPU().MemoryManager(); + const std::optional dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr); + const std::optional src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr); + std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); + TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second; + ImageBlit(src_surface, dst_surface.second, copy_config); dst_surface.first->MarkAsModified(true, Tick()); } From bd43c0547085fcfb585ac3a90521eeb8414fd538 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 8 Jun 2020 
05:02:22 -0300 Subject: [PATCH 052/145] texture_cache: Port original code management for 2D vs 3D textures Handle blits to images as 2D, even when they have block depth. - Fixes rendering issues on Luigi's Mansion 3 --- .../texture_cache/surface_params.cpp | 2 +- src/video_core/texture_cache/texture_cache.h | 49 +++++++++++++------ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 6fe7c85aca..0b2b2b8c43 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -247,7 +247,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface( params.height = config.height; params.pitch = config.pitch; // TODO(Rodrigo): Try to guess texture arrays from parameters - params.target = params.block_depth > 0 ? SurfaceTarget::Texture3D : SurfaceTarget::Texture2D; + params.target = SurfaceTarget::Texture2D; params.depth = 1; params.num_levels = 1; params.emulated_levels = 1; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index b19eeed664..b543fc8c03 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -724,10 +724,35 @@ private: * @param params The parameters on the new surface. * @param gpu_addr The starting address of the new surface. * @param cpu_addr The starting address of the new surface on physical memory. + * @param preserve_contents Indicates that the new surface should be loaded from memory or + * left blank. */ std::optional> Manage3DSurfaces(VectorSurface& overlaps, const SurfaceParams& params, - GPUVAddr gpu_addr, VAddr cpu_addr) { + GPUVAddr gpu_addr, VAddr cpu_addr, + bool preserve_contents) { + if (params.target != SurfaceTarget::Texture3D) { + for (const auto& surface : overlaps) { + if (!surface->MatchTarget(params.target)) { + if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { + if (Settings::IsGPULevelExtreme()) { + return std::nullopt; + } + Unregister(surface); + return InitializeSurface(gpu_addr, params, preserve_contents); + } + return std::nullopt; + } + if (surface->GetCpuAddr() != cpu_addr) { + continue; + } + if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { + return std::make_pair(surface, surface->GetMainView()); + } + } + return InitializeSurface(gpu_addr, params, preserve_contents); + } + if (params.num_levels > 1) { // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach return std::nullopt; @@ -748,25 +773,18 @@ private: } } - if (params.depth == 1) { - return std::nullopt; - } - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - LoadSurface(new_surface); - bool modified = false; + for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.height != params.height || + if (src_params.target != SurfaceTarget::Texture2D || + src_params.height != params.height || src_params.block_depth != params.block_depth || src_params.block_height != params.block_height) { return std::nullopt; } - if (!surface->IsModified()) { - continue; - } - modified = true; + modified |= surface->IsModified(); const u32 offset = static_cast(surface->GetCpuAddr() - cpu_addr); const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); @@ -781,7 +799,7 @@ private: new_surface->MarkAsModified(modified, Tick()); Register(new_surface); - auto view = new_surface->GetMainView(); + TView view 
= new_surface->GetMainView(); return std::make_pair(std::move(new_surface), std::move(view)); } @@ -861,8 +879,9 @@ private: } // Manage 3D textures - if (params.target == SurfaceTarget::Texture3D) { - auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr); + if (params.block_depth > 0) { + auto surface = + Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); if (surface) { return *surface; } From 6e122f0b2c3fe6a2cccd15256a04bcd4342c86f7 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 8 Jun 2020 20:22:31 -0300 Subject: [PATCH 053/145] buffer_cache: Return stream buffer invalidation in Map instead of Unmap We have to invalidate whatever cache is being used before uploading the data, hence it makes more sense to return this on Map instead of Unmap. --- src/video_core/buffer_cache/buffer_cache.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b88fce2cd5..77ae343390 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -110,19 +110,23 @@ public: }); } - void Map(std::size_t max_size) { + /// Prepares the buffer cache for data uploading + /// @param max_size Maximum number of bytes that will be uploaded + /// @return True when a stream buffer invalidation was required, false otherwise + bool Map(std::size_t max_size) { std::lock_guard lock{mutex}; + bool invalidated; std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); buffer_offset = buffer_offset_base; + + return invalidated; } - /// Finishes the upload stream, returns true on bindings invalidation. - bool Unmap() { + /// Finishes the upload stream + void Unmap() { std::lock_guard lock{mutex}; - stream_buffer->Unmap(buffer_offset - buffer_offset_base); - return std::exchange(invalidated, false); } void TickFrame() { @@ -576,8 +580,6 @@ private: std::unique_ptr stream_buffer; BufferType stream_buffer_handle{}; - bool invalidated = false; - u8* buffer_ptr = nullptr; u64 buffer_offset = 0; u64 buffer_offset_base = 0; From 7646f2c21dcd3e36073142da84d56cd7640476a9 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 8 Jun 2020 20:24:16 -0300 Subject: [PATCH 054/145] gl_rasterizer: Mark vertex buffers as dirty after buffer cache invalidation Vertex buffers bindings become invalid after the stream buffer is invalidated. We were originally doing this, but it got lost at some point. - Fixes Animal Crossing: New Horizons, but it affects everything. --- src/video_core/renderer_opengl/gl_rasterizer.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 55e79aaf65..5673e30b33 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -576,7 +576,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. 
- buffer_cache.Map(buffer_size); + const bool invalidated = buffer_cache.Map(buffer_size); + + if (invalidated) { + // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty + auto& dirty = gpu.dirty.flags; + dirty[Dirty::VertexBuffers] = true; + for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { + dirty[index] = true; + } + } // Prepare vertex array format. SetupVertexFormat(); From 6508cdd00351e51c7d5867c00da60781c133ade8 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 9 Jun 2020 18:27:59 -0300 Subject: [PATCH 055/145] buffer_cache: Avoid passing references of shared pointers and misc style changes Instead of using as template argument a shared pointer, use the underlying type and manage shared pointers explicitly. This can make removing shared pointers from the cache more easy. While we are at it, make some misc style changes and general improvements (like insert_or_assign instead of operator[] + operator=). --- src/video_core/buffer_cache/buffer_block.h | 27 ++- src/video_core/buffer_cache/buffer_cache.h | 199 +++++++++--------- .../renderer_opengl/gl_buffer_cache.cpp | 21 +- .../renderer_opengl/gl_buffer_cache.h | 18 +- .../renderer_opengl/gl_stream_buffer.cpp | 8 - .../renderer_opengl/gl_stream_buffer.h | 11 +- .../renderer_vulkan/vk_buffer_cache.cpp | 22 +- .../renderer_vulkan/vk_buffer_cache.h | 16 +- .../renderer_vulkan/vk_stream_buffer.h | 2 +- 9 files changed, 150 insertions(+), 174 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h index e35ee0b673..e64170e668 100644 --- a/src/video_core/buffer_cache/buffer_block.h +++ b/src/video_core/buffer_cache/buffer_block.h @@ -15,48 +15,47 @@ namespace VideoCommon { class BufferBlock { public: - bool Overlaps(const VAddr start, const VAddr end) const { + bool Overlaps(VAddr start, VAddr end) const { return (cpu_addr < end) && (cpu_addr_end > start); } - bool IsInside(const VAddr other_start, const VAddr other_end) const { + bool IsInside(VAddr other_start, VAddr other_end) const { return cpu_addr <= other_start && other_end <= cpu_addr_end; } - std::size_t GetOffset(const VAddr in_addr) { + std::size_t Offset(VAddr in_addr) const { return static_cast(in_addr - cpu_addr); } - VAddr GetCpuAddr() const { + VAddr CpuAddr() const { return cpu_addr; } - VAddr GetCpuAddrEnd() const { + VAddr CpuAddrEnd() const { return cpu_addr_end; } - void SetCpuAddr(const VAddr new_addr) { + void SetCpuAddr(VAddr new_addr) { cpu_addr = new_addr; cpu_addr_end = new_addr + size; } - std::size_t GetSize() const { + std::size_t Size() const { return size; } + u64 Epoch() const { + return epoch; + } + void SetEpoch(u64 new_epoch) { epoch = new_epoch; } - u64 GetEpoch() { - return epoch; - } - protected: - explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} { - SetCpuAddr(cpu_addr); + explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { + SetCpuAddr(cpu_addr_); } - ~BufferBlock() = default; private: VAddr cpu_addr{}; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b88fce2cd5..efc480d08f 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -30,12 +30,16 @@ namespace VideoCommon { -template +template class BufferCache { using IntervalSet = boost::icl::interval_set; using IntervalType = typename IntervalSet::interval_type; using VectorMapInterval = boost::container::small_vector; + 
static constexpr u64 WRITE_PAGE_BIT = 11; + static constexpr u64 BLOCK_PAGE_BITS = 21; + static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; + public: using BufferInfo = std::pair; @@ -82,7 +86,7 @@ public: } } - OwnerBuffer block = GetBlock(cpu_addr, size); + Buffer* const block = GetBlock(cpu_addr, size); MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size); if (!map) { return {GetEmptyBuffer(size), 0}; @@ -98,7 +102,7 @@ public: } } - return {ToHandle(block), static_cast(block->GetOffset(cpu_addr))}; + return {block->Handle(), static_cast(block->Offset(cpu_addr))}; } /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. @@ -125,16 +129,18 @@ public: return std::exchange(invalidated, false); } + /// Function called at the end of each frame, inteded for deferred operations void TickFrame() { ++epoch; + while (!pending_destruction.empty()) { // Delay at least 4 frames before destruction. // This is due to triple buffering happening on some drivers. static constexpr u64 epochs_to_destroy = 5; - if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { + if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { break; } - pending_destruction.pop_front(); + pending_destruction.pop(); } } @@ -249,23 +255,21 @@ public: protected: explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - std::unique_ptr stream_buffer) - : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)}, - stream_buffer_handle{this->stream_buffer->GetHandle()} {} + std::unique_ptr stream_buffer_) + : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)}, + stream_buffer_handle{stream_buffer->Handle()} {} ~BufferCache() = default; - virtual BufferType ToHandle(const OwnerBuffer& storage) = 0; + virtual std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) = 0; - virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0; - - virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, + virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, const u8* data) = 0; - virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, + virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, u8* data) = 0; - virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset, + virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, std::size_t dst_offset, std::size_t size) = 0; virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { @@ -321,7 +325,7 @@ protected: } private: - MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr, + MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); if (overlaps.empty()) { @@ -329,11 +333,11 @@ private: const VAddr cpu_addr_end = cpu_addr + size; if (memory_manager.IsGranularRange(gpu_addr, size)) { u8* host_ptr = memory_manager.GetPointer(gpu_addr); - UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr); + UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr); } else { staging_buffer.resize(size); memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(cpu_addr), 
size, staging_buffer.data()); + UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data()); } return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); } @@ -376,7 +380,7 @@ private: return map; } - void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end, + void UpdateBlock(const Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { const IntervalType base_interval{start, end}; IntervalSet interval_set{}; @@ -386,13 +390,13 @@ private: interval_set.subtract(subtract); } for (auto& interval : interval_set) { - std::size_t size = interval.upper() - interval.lower(); - if (size > 0) { - staging_buffer.resize(size); - system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(interval.lower()), size, - staging_buffer.data()); + const std::size_t size = interval.upper() - interval.lower(); + if (size == 0) { + continue; } + staging_buffer.resize(size); + system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); + UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data()); } } @@ -422,10 +426,14 @@ private: } void FlushMap(MapInterval* map) { + const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); + ASSERT_OR_EXECUTE(it != blocks.end(), return;); + + std::shared_ptr block = it->second; + const std::size_t size = map->end - map->start; - OwnerBuffer block = blocks[map->start >> block_page_bits]; staging_buffer.resize(size); - DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data()); + DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data()); system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size); map->MarkAsModified(false, 0); } @@ -448,97 +456,89 @@ private: buffer_offset = offset_aligned; } - OwnerBuffer EnlargeBlock(OwnerBuffer buffer) { - const std::size_t old_size = buffer->GetSize(); - const std::size_t new_size = old_size + block_page_size; - const VAddr cpu_addr = buffer->GetCpuAddr(); - OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size); - CopyBlock(buffer, new_buffer, 0, 0, old_size); - buffer->SetEpoch(epoch); - pending_destruction.push_back(buffer); + std::shared_ptr EnlargeBlock(std::shared_ptr buffer) { + const std::size_t old_size = buffer->Size(); + const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; + const VAddr cpu_addr = buffer->CpuAddr(); + std::shared_ptr new_buffer = CreateBlock(cpu_addr, new_size); + CopyBlock(*buffer, *new_buffer, 0, 0, old_size); + QueueDestruction(std::move(buffer)); + const VAddr cpu_addr_end = cpu_addr + new_size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } + return new_buffer; } - OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) { - const std::size_t size_1 = first->GetSize(); - const std::size_t size_2 = second->GetSize(); - const VAddr first_addr = first->GetCpuAddr(); - const VAddr second_addr = second->GetCpuAddr(); + std::shared_ptr MergeBlocks(std::shared_ptr first, + std::shared_ptr second) { + const std::size_t size_1 = first->Size(); + const std::size_t size_2 = second->Size(); + const VAddr first_addr = first->CpuAddr(); + const 
VAddr second_addr = second->CpuAddr(); const VAddr new_addr = std::min(first_addr, second_addr); const std::size_t new_size = size_1 + size_2; - OwnerBuffer new_buffer = CreateBlock(new_addr, new_size); - CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1); - CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2); - first->SetEpoch(epoch); - second->SetEpoch(epoch); - pending_destruction.push_back(first); - pending_destruction.push_back(second); + + std::shared_ptr new_buffer = CreateBlock(new_addr, new_size); + CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1); + CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2); + QueueDestruction(std::move(first)); + QueueDestruction(std::move(second)); + const VAddr cpu_addr_end = new_addr + new_size - 1; - u64 page_start = new_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } return new_buffer; } - OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) { - OwnerBuffer found; + Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { + std::shared_ptr found; + const VAddr cpu_addr_end = cpu_addr + size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { auto it = blocks.find(page_start); if (it == blocks.end()) { if (found) { found = EnlargeBlock(found); - } else { - const VAddr start_addr = (page_start << block_page_bits); - found = CreateBlock(start_addr, block_page_size); - blocks[page_start] = found; - } - } else { - if (found) { - if (found == it->second) { - ++page_start; - continue; - } - found = MergeBlocks(found, it->second); - } else { - found = it->second; + continue; } + const VAddr start_addr = page_start << BLOCK_PAGE_BITS; + found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); + blocks.insert_or_assign(page_start, found); + continue; + } + if (!found) { + found = it->second; + continue; + } + if (found != it->second) { + found = MergeBlocks(std::move(found), it->second); } - ++page_start; } - return found; + return found.get(); } - void MarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + void MarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { auto it = written_pages.find(page_start); if (it != written_pages.end()) { it->second = it->second + 1; } else { - written_pages[page_start] = 1; + written_pages.insert_or_assign(page_start, 1); } - ++page_start; } } - void UnmarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + void UnmarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { 
auto it = written_pages.find(page_start); if (it != written_pages.end()) { if (it->second > 1) { @@ -547,22 +547,24 @@ private: written_pages.erase(it); } } - ++page_start; } } - bool IsRegionWritten(const VAddr start, const VAddr end) const { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + bool IsRegionWritten(VAddr start, VAddr end) const { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { if (written_pages.count(page_start) > 0) { return true; } - ++page_start; } return false; } + void QueueDestruction(std::shared_ptr buffer) { + buffer->SetEpoch(epoch); + pending_destruction.push(std::move(buffer)); + } + void MarkForAsyncFlush(MapInterval* map) { if (!uncommitted_flushes) { uncommitted_flushes = std::make_shared>(); @@ -574,7 +576,7 @@ private: Core::System& system; std::unique_ptr stream_buffer; - BufferType stream_buffer_handle{}; + BufferType stream_buffer_handle; bool invalidated = false; @@ -586,18 +588,15 @@ private: boost::intrusive::set> mapped_addresses; - static constexpr u64 write_page_bit = 11; std::unordered_map written_pages; + std::unordered_map> blocks; - static constexpr u64 block_page_bits = 21; - static constexpr u64 block_page_size = 1ULL << block_page_bits; - std::unordered_map blocks; - - std::list pending_destruction; + std::queue> pending_destruction; u64 epoch = 0; u64 modified_ticks = 0; std::vector staging_buffer; + std::list marked_for_unregister; std::shared_ptr> uncommitted_flushes; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 9964ea8941..ad0577a4f4 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -22,13 +22,12 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size) - : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { gl_buffer.Create(); glNamedBufferData(gl_buffer.handle, static_cast(size), nullptr, GL_DYNAMIC_DRAW); } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default; OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, const Device& device, std::size_t stream_size) @@ -48,12 +47,8 @@ OGLBufferCache::~OGLBufferCache() { glDeleteBuffers(static_cast(std::size(cbufs)), std::data(cbufs)); } -Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared(cpu_addr, size); -} - -GLuint OGLBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); +std::shared_ptr OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared(cpu_addr, size); } GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) { @@ -62,7 +57,7 @@ GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) { void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, const u8* data) { - glNamedBufferSubData(buffer->GetHandle(), static_cast(offset), + glNamedBufferSubData(buffer.Handle(), static_cast(offset), static_cast(size), data); } @@ -70,20 +65,20 @@ void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, u8* data) { 
MICROPROFILE_SCOPE(OpenGL_Buffer_Download); glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); - glGetNamedBufferSubData(buffer->GetHandle(), static_cast(offset), + glGetNamedBufferSubData(buffer.Handle(), static_cast(offset), static_cast(size), data); } void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, std::size_t dst_offset, std::size_t size) { - glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast(src_offset), + glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast(src_offset), static_cast(dst_offset), static_cast(size)); } OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, std::size_t size) { DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint& cbuf = cbufs[cbuf_cursor++]; + const GLuint cbuf = cbufs[cbuf_cursor++]; glNamedBufferSubData(cbuf, 0, static_cast(size), raw_pointer); return {cbuf, 0}; } diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a9e86cfc7c..7168312b17 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -24,17 +24,12 @@ class Device; class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferBlock; - -using Buffer = std::shared_ptr; -using GenericBufferCache = VideoCommon::BufferCache; - -class CachedBufferBlock : public VideoCommon::BufferBlock { +class Buffer : public VideoCommon::BufferBlock { public: - explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size); - ~CachedBufferBlock(); + explicit Buffer(VAddr cpu_addr, const std::size_t size); + ~Buffer(); - GLuint GetHandle() const { + GLuint Handle() const { return gl_buffer.handle; } @@ -42,6 +37,7 @@ private: OGLBuffer gl_buffer; }; +using GenericBufferCache = VideoCommon::BufferCache; class OGLBufferCache final : public GenericBufferCache { public: explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, @@ -55,9 +51,7 @@ public: } protected: - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - GLuint ToHandle(const Buffer& buffer) override; + std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) override; void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, const u8* data) override; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 6ec328c538..932a2f69ef 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -49,14 +49,6 @@ OGLStreamBuffer::~OGLStreamBuffer() { gl_buffer.Release(); } -GLuint OGLStreamBuffer::GetHandle() const { - return gl_buffer.handle; -} - -GLsizeiptr OGLStreamBuffer::GetSize() const { - return buffer_size; -} - std::tuple OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { ASSERT(size <= buffer_size); ASSERT(alignment <= buffer_size); diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index f8383cbd45..866da35940 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -17,9 +17,6 @@ public: bool use_persistent = true); ~OGLStreamBuffer(); - GLuint GetHandle() const; - GLsizeiptr GetSize() const; - /* * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes * and the optional alignment requirement. 
@@ -32,6 +29,14 @@ public: void Unmap(GLsizeiptr size); + GLuint Handle() const { + return gl_buffer.handle; + } + + GLsizeiptr Size() const { + return buffer_size; + } + private: OGLBuffer gl_buffer; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 5f33d9e40e..1fde383280 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -37,8 +37,8 @@ std::unique_ptr CreateStreamBuffer(const VKDevice& device, VKSch } // Anonymous namespace -CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size) +Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, + std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { VkBufferCreateInfo ci; ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; @@ -54,7 +54,7 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me buffer.commit = memory_manager.Commit(buffer.handle, false); } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default; VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, const VKDevice& device, VKMemoryManager& memory_manager, @@ -67,12 +67,8 @@ VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::S VKBufferCache::~VKBufferCache() = default; -Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared(device, memory_manager, cpu_addr, size); -} - -VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); +std::shared_ptr VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared(device, memory_manager, cpu_addr, size); } VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) { @@ -91,7 +87,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st std::memcpy(staging.commit->Map(size), data, size); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset, + scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset, size](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size}); @@ -114,7 +110,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, u8* data) { const auto& staging = staging_pool.GetUnusedBuffer(size, true); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset, + scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset, size](vk::CommandBuffer cmdbuf) { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -141,8 +137,8 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, std::size_t dst_offset, std::size_t size) { scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset, - dst_offset, size](vk::CommandBuffer cmdbuf) { + scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset, + size](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size}); 
std::array barriers; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index a54583e7d0..cb9734c105 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -24,13 +24,13 @@ class VKDevice; class VKMemoryManager; class VKScheduler; -class CachedBufferBlock final : public VideoCommon::BufferBlock { +class Buffer final : public VideoCommon::BufferBlock { public: - explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size); - ~CachedBufferBlock(); + explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, + std::size_t size); + ~Buffer(); - VkBuffer GetHandle() const { + VkBuffer Handle() const { return *buffer.handle; } @@ -38,8 +38,6 @@ private: VKBuffer buffer; }; -using Buffer = std::shared_ptr; - class VKBufferCache final : public VideoCommon::BufferCache { public: explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, @@ -50,9 +48,7 @@ public: VkBuffer GetEmptyBuffer(std::size_t size) override; protected: - VkBuffer ToHandle(const Buffer& buffer) override; - - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; + std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) override; void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, const u8* data) override; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index dfddf7ad6a..c765c60a06 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -35,7 +35,7 @@ public: /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. void Unmap(u64 size); - VkBuffer GetHandle() const { + VkBuffer Handle() const { return *buffer; } From 74ff1db758a67601b2fa02ea1668ec05f1c5e2ff Mon Sep 17 00:00:00 2001 From: David Marcec Date: Wed, 10 Jun 2020 14:49:00 +1000 Subject: [PATCH 056/145] kernel: Account for system resource size for memory usage GetTotalPhysicalMemoryAvailableWithoutSystemResource & GetTotalPhysicalMemoryUsedWithoutSystemResource seem to subtract the resource size from the usage. 
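As a rough illustration of the accounting this change relies on (hypothetical sizes, not values from the patch): once the system resource size is folded into the "available" and "used" totals, the WithoutSystemResource variants can subtract it back out without underflowing. A minimal C++ sketch, with made-up numbers and simplified names standing in for the Process getters:

#include <cassert>
#include <cstdint>

int main() {
    // Made-up sizes standing in for image_size, main_thread_stack_size,
    // page_table->GetTotalHeapSize() and GetSystemResourceSize().
    const std::uint64_t image_size = 0x0200'0000;
    const std::uint64_t main_thread_stack_size = 0x0010'0000;
    const std::uint64_t heap_size = 0x0400'0000;
    const std::uint64_t system_resource_size = 0x0080'0000;

    // After this change, the "used" total includes the system resource region...
    const std::uint64_t used =
        image_size + main_thread_stack_size + heap_size + system_resource_size;

    // ...so the WithoutSystemResource variant can report that total minus the
    // resource region and still land on the plain image + stack + heap figure.
    const std::uint64_t used_without_system_resource = used - system_resource_size;
    assert(used_without_system_resource ==
           image_size + main_thread_stack_size + heap_size);
    return 0;
}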
--- src/core/hle/kernel/process.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 36724569f5..c4c5199b14 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -132,7 +132,8 @@ std::shared_ptr Process::GetResourceLimit() const { u64 Process::GetTotalPhysicalMemoryAvailable() const { const u64 capacity{resource_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory) + - page_table->GetTotalHeapSize() + image_size + main_thread_stack_size}; + page_table->GetTotalHeapSize() + GetSystemResourceSize() + image_size + + main_thread_stack_size}; if (capacity < memory_usage_capacity) { return capacity; @@ -146,7 +147,8 @@ u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const { } u64 Process::GetTotalPhysicalMemoryUsed() const { - return image_size + main_thread_stack_size + page_table->GetTotalHeapSize(); + return image_size + main_thread_stack_size + page_table->GetTotalHeapSize() + + GetSystemResourceSize(); } u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const { From b15cbf9bcf4e02f182abfd8ff84921f436b8c464 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Wed, 10 Jun 2020 18:36:42 +1000 Subject: [PATCH 057/145] nvdrv: Fix GetTPCMasks for ioctl3 Fixes animal crossing svcBreak on launch --- .../service/nvdrv/devices/nvhost_ctrl_gpu.cpp | 25 +++++++++++-------- .../service/nvdrv/devices/nvhost_ctrl_gpu.h | 18 ++++++------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp index cc2192e5ca..0d913334e8 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp @@ -25,7 +25,7 @@ u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector& input, case IoctlCommand::IocGetCharacteristicsCommand: return GetCharacteristics(input, output, output2, version); case IoctlCommand::IocGetTPCMasksCommand: - return GetTPCMasks(input, output); + return GetTPCMasks(input, output, output2, version); case IoctlCommand::IocGetActiveSlotMaskCommand: return GetActiveSlotMask(input, output); case IoctlCommand::IocZcullGetCtxSizeCommand: @@ -98,17 +98,22 @@ u32 nvhost_ctrl_gpu::GetCharacteristics(const std::vector& input, std::vecto return 0; } -u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector& input, std::vector& output) { +u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector& input, std::vector& output, + std::vector& output2, IoctlVersion version) { IoctlGpuGetTpcMasksArgs params{}; std::memcpy(¶ms, input.data(), input.size()); - LOG_INFO(Service_NVDRV, "called, mask=0x{:X}, mask_buf_addr=0x{:X}", params.mask_buf_size, - params.mask_buf_addr); - // TODO(ogniK): Confirm value on hardware - if (params.mask_buf_size) - params.tpc_mask_size = 4 * 1; // 4 * num_gpc - else - params.tpc_mask_size = 0; - std::memcpy(output.data(), ¶ms, sizeof(params)); + LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size); + if (params.mask_buffer_size != 0) { + params.tcp_mask = 3; + } + + if (version == IoctlVersion::Version3) { + std::memcpy(output.data(), input.data(), output.size()); + std::memcpy(output2.data(), ¶ms.tcp_mask, output2.size()); + } else { + std::memcpy(output.data(), ¶ms, output.size()); + } + return 0; } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h index 
07b644ec55..ef60f72ce9 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h @@ -92,16 +92,11 @@ private: "IoctlCharacteristics is incorrect size"); struct IoctlGpuGetTpcMasksArgs { - /// [in] TPC mask buffer size reserved by userspace. Should be at least - /// sizeof(__u32) * fls(gpc_mask) to receive TPC mask for each GPC. - /// [out] full kernel buffer size - u32_le mask_buf_size; - u32_le reserved; - - /// [in] pointer to TPC mask buffer. It will receive one 32-bit TPC mask per GPC or 0 if - /// GPC is not enabled or not present. This parameter is ignored if mask_buf_size is 0. - u64_le mask_buf_addr; - u64_le tpc_mask_size; // Nintendo add this? + u32_le mask_buffer_size{}; + INSERT_PADDING_WORDS(1); + u64_le mask_buffer_address{}; + u32_le tcp_mask{}; + INSERT_PADDING_WORDS(1); }; static_assert(sizeof(IoctlGpuGetTpcMasksArgs) == 24, "IoctlGpuGetTpcMasksArgs is incorrect size"); @@ -166,7 +161,8 @@ private: u32 GetCharacteristics(const std::vector& input, std::vector& output, std::vector& output2, IoctlVersion version); - u32 GetTPCMasks(const std::vector& input, std::vector& output); + u32 GetTPCMasks(const std::vector& input, std::vector& output, std::vector& output2, + IoctlVersion version); u32 GetActiveSlotMask(const std::vector& input, std::vector& output); u32 ZCullGetCtxSize(const std::vector& input, std::vector& output); u32 ZCullGetInfo(const std::vector& input, std::vector& output); From d89888389db97c25bd3df4ee5ae538312d594404 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 24 May 2020 04:00:40 -0300 Subject: [PATCH 058/145] yuzu/configuration: Show assembly shaders check box --- src/yuzu/configuration/configure_graphics_advanced.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp index 37aadf7f8d..be5006ad3b 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.cpp +++ b/src/yuzu/configuration/configure_graphics_advanced.cpp @@ -12,9 +12,6 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent) ui->setupUi(this); - // TODO: Remove this after assembly shaders are fully integrated - ui->use_assembly_shaders->setVisible(false); - SetConfiguration(); } From a63a0daa5e773574019ec521c0a07096efbdcd36 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Wed, 3 Jun 2020 18:07:35 -0300 Subject: [PATCH 059/145] gl_arb_decompiler: Implement an assembly shader decompiler Emit code compatible with NV_gpu_program5. This should emit code compatible with Fermi, but it wasn't tested on that architecture. Pascal has some issues not present on Turing GPUs. 
--- CMakeModules/GenerateSCMRev.cmake | 2 + src/common/CMakeLists.txt | 2 + src/video_core/CMakeLists.txt | 2 + .../renderer_opengl/gl_arb_decompiler.cpp | 2051 +++++++++++++++++ .../renderer_opengl/gl_arb_decompiler.h | 29 + src/video_core/renderer_opengl/gl_device.cpp | 1 + src/video_core/renderer_opengl/gl_device.h | 5 + .../renderer_opengl/gl_shader_cache.cpp | 4 +- 8 files changed, 2095 insertions(+), 1 deletion(-) create mode 100644 src/video_core/renderer_opengl/gl_arb_decompiler.cpp create mode 100644 src/video_core/renderer_opengl/gl_arb_decompiler.h diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake index 83e4e9df2e..311ba1c2e2 100644 --- a/CMakeModules/GenerateSCMRev.cmake +++ b/CMakeModules/GenerateSCMRev.cmake @@ -51,6 +51,8 @@ endif() # The variable SRC_DIR must be passed into the script (since it uses the current build directory for all values of CMAKE_*_DIR) set(VIDEO_CORE "${SRC_DIR}/src/video_core") set(HASH_FILES + "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp" + "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h" "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp" "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h" "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp" diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 24b7a083c1..0a3e2f4d19 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -32,6 +32,8 @@ add_custom_command(OUTPUT scm_rev.cpp DEPENDS # WARNING! It was too much work to try and make a common location for this list, # so if you need to change it, please update CMakeModules/GenerateSCMRev.cmake as well + "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp" + "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h" "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp" "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h" "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp" diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 2bf8d68ce0..2af713af28 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -54,6 +54,8 @@ add_library(video_core STATIC rasterizer_interface.h renderer_base.cpp renderer_base.h + renderer_opengl/gl_arb_decompiler.cpp + renderer_opengl/gl_arb_decompiler.h renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_device.cpp diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp new file mode 100644 index 0000000000..6a23221bbd --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -0,0 +1,2051 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/shader_ir.h" + +// Predicates in the decompiled code follow the convention that -1 means true and 0 means false. +// GLASM lacks booleans, so they have to be implemented as integers. +// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to +// select between two values, because -1 will be evaluated as true and 0 as false. 
+ +namespace OpenGL { + +namespace { + +using Tegra::Engines::ShaderType; +using Tegra::Shader::Attribute; +using Tegra::Shader::PixelImap; +using Tegra::Shader::Register; +using namespace VideoCommon::Shader; +using Operation = const OperationNode&; + +constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; + +char Swizzle(std::size_t component) { + ASSERT(component < 4); + return component["xyzw"]; +} + +constexpr bool IsGenericAttribute(Attribute::Index index) { + return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31; +} + +u32 GetGenericAttributeIndex(Attribute::Index index) { + ASSERT(IsGenericAttribute(index)); + return static_cast(index) - static_cast(Attribute::Index::Attribute_0); +} + +std::string_view Modifiers(Operation operation) { + const auto meta = std::get_if(&operation.GetMeta()); + if (meta && meta->precise) { + return ".PREC"; + } + return ""; +} + +std::string_view GetInputFlags(PixelImap attribute) { + switch (attribute) { + case PixelImap::Perspective: + return ""; + case PixelImap::Constant: + return "FLAT "; + case PixelImap::ScreenLinear: + return "NOPERSPECTIVE "; + case PixelImap::Unused: + break; + } + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast(attribute)); + return {}; +} + +std::string_view ImageType(Tegra::Shader::ImageType image_type) { + switch (image_type) { + case Tegra::Shader::ImageType::Texture1D: + return "1D"; + case Tegra::Shader::ImageType::TextureBuffer: + return "BUFFER"; + case Tegra::Shader::ImageType::Texture1DArray: + return "ARRAY1D"; + case Tegra::Shader::ImageType::Texture2D: + return "2D"; + case Tegra::Shader::ImageType::Texture2DArray: + return "ARRAY2D"; + case Tegra::Shader::ImageType::Texture3D: + return "3D"; + } + UNREACHABLE(); + return {}; +} + +std::string_view StackName(MetaStackClass stack) { + switch (stack) { + case MetaStackClass::Ssy: + return "SSY"; + case MetaStackClass::Pbk: + return "PBK"; + } + UNREACHABLE(); + return ""; +}; + +std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) { + switch (topology) { + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points: + return "POINTS"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return "LINES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency: + return "LINES_ADJACENCY"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + return "TRIANGLES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return "TRIANGLES_ADJACENCY"; + default: + UNIMPLEMENTED_MSG("topology={}", static_cast(topology)); + return "POINTS"; + } +} + +std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { + switch (topology) { + case Tegra::Shader::OutputTopology::PointList: + return "POINTS"; + case Tegra::Shader::OutputTopology::LineStrip: + return "LINE_STRIP"; + case Tegra::Shader::OutputTopology::TriangleStrip: + return "TRIANGLE_STRIP"; + default: + UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast(topology)); + return "points"; + } +} + +std::string_view 
StageInputName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + case ShaderType::Geometry: + return "vertex"; + case ShaderType::Fragment: + return "fragment"; + case ShaderType::Compute: + return "invocation"; + default: + UNREACHABLE(); + return ""; + } +} + +std::string TextureType(const MetaTexture& meta) { + if (meta.sampler.is_buffer) { + return "BUFFER"; + } + std::string type; + if (meta.sampler.is_shadow) { + type += "SHADOW"; + } + if (meta.sampler.is_array) { + type += "ARRAY"; + } + type += [&meta] { + switch (meta.sampler.type) { + case Tegra::Shader::TextureType::Texture1D: + return "1D"; + case Tegra::Shader::TextureType::Texture2D: + return "2D"; + case Tegra::Shader::TextureType::Texture3D: + return "3D"; + case Tegra::Shader::TextureType::TextureCube: + return "CUBE"; + } + UNREACHABLE(); + return "2D"; + }(); + return type; +} + +std::string GlobalMemoryName(const GlobalMemoryBase& base) { + return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset); +} + +class ARBDecompiler final { +public: + explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier); + + std::string Code() const { + return shader_source; + } + +private: + void DeclareHeader(); + void DeclareVertex(); + void DeclareGeometry(); + void DeclareFragment(); + void DeclareCompute(); + void DeclareInputAttributes(); + void DeclareOutputAttributes(); + void DeclareLocalMemory(); + void DeclareGlobalMemory(); + void DeclareConstantBuffers(); + void DeclareRegisters(); + void DeclareTemporaries(); + void DeclarePredicates(); + void DeclareInternalFlags(); + + void InitializeVariables(); + + void DecompileAST(); + void DecompileBranchMode(); + + void VisitAST(const ASTNode& node); + std::string VisitExpression(const Expr& node); + + void VisitBlock(const NodeBlock& bb); + + std::string Visit(const Node& node); + + std::pair BuildCoords(Operation); + std::string BuildAoffi(Operation); + void Exit(); + + std::string Assign(Operation); + std::string Select(Operation); + std::string FClamp(Operation); + std::string FCastHalf0(Operation); + std::string FCastHalf1(Operation); + std::string FSqrt(Operation); + std::string FSwizzleAdd(Operation); + std::string HAdd2(Operation); + std::string HMul2(Operation); + std::string HFma2(Operation); + std::string HAbsolute(Operation); + std::string HNegate(Operation); + std::string HClamp(Operation); + std::string HCastFloat(Operation); + std::string HUnpack(Operation); + std::string HMergeF32(Operation); + std::string HMergeH0(Operation); + std::string HMergeH1(Operation); + std::string HPack2(Operation); + std::string LogicalAssign(Operation); + std::string LogicalPick2(Operation); + std::string LogicalAnd2(Operation); + std::string FloatOrdered(Operation); + std::string FloatUnordered(Operation); + std::string LogicalAddCarry(Operation); + std::string Texture(Operation); + std::string TextureGather(Operation); + std::string TextureQueryDimensions(Operation); + std::string TextureQueryLod(Operation); + std::string TexelFetch(Operation); + std::string TextureGradient(Operation); + std::string ImageLoad(Operation); + std::string ImageStore(Operation); + std::string Branch(Operation); + std::string BranchIndirect(Operation); + std::string PushFlowStack(Operation); + std::string PopFlowStack(Operation); + std::string Exit(Operation); + std::string Discard(Operation); + std::string EmitVertex(Operation); + std::string EndPrimitive(Operation); + std::string 
InvocationId(Operation); + std::string YNegate(Operation); + std::string ThreadId(Operation); + std::string ShuffleIndexed(Operation); + std::string Barrier(Operation); + std::string MemoryBarrierGroup(Operation); + std::string MemoryBarrierGlobal(Operation); + + template + std::string Unary(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0])); + return temporary; + } + + template + std::string Binary(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1])); + return temporary; + } + + template + std::string Trinary(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1]), Visit(operation[2])); + return temporary; + } + + template + std::string FloatComparison(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("TRUNC.U.CC RC.x, {};", Binary(operation)); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NE.x), -1;", temporary); + + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + if constexpr (unordered) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + } else if (op == SNE_F) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + } + return temporary; + } + + template + std::string HalfComparison(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + AddLine("UP2H.F {}, {};", tmp1, op_a); + AddLine("UP2H.F {}, {};", tmp2, op_b); + AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2); + AddLine("TRUNC.U.CC RC.xy, {};", tmp1); + AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); + AddLine("MOV.S {}.x (NE.x), -1;", tmp1); + AddLine("MOV.S {}.y (NE.y), -1;", tmp1); + if constexpr (is_nan) { + AddLine("MOVC.F RC.x, {};", op_a); + AddLine("MOV.S {}.x (NAN.x), -1;", tmp1); + AddLine("MOVC.F RC.x, {};", op_b); + AddLine("MOV.S {}.y (NAN.x), -1;", tmp1); + } + return tmp1; + } + + template + std::string AtomicImage(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + + const std::string result = coord; + AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, result, value, coord, + image_id, ImageType(meta.image.type)); + 
return fmt::format("{}.x", result); + } + + template + std::string Atomic(Operation operation) { + const std::string temporary = AllocTemporary(); + std::string address; + std::string_view opname; + if (const auto gmem = std::get_if(&*operation[0])) { + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), + Visit(gmem->GetBaseAddress())); + address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary); + opname = "ATOMB"; + } else if (const auto smem = std::get_if(&*operation[0])) { + address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); + opname = "ATOMS"; + } else { + UNREACHABLE(); + return "{0, 0, 0, 0}"; + } + AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + return temporary; + } + + template + std::string Negate(Operation operation) { + const std::string temporary = AllocTemporary(); + if constexpr (type == 'F') { + AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0])); + } else { + AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0])); + } + return temporary; + } + + template + std::string Absolute(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0])); + return temporary; + } + + template + std::string BitfieldInsert(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2])); + AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]), + Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template + std::string BitfieldExtract(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1])); + AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template + std::string LocalInvocationId(Operation) { + return fmt::format("invocation.localid.{}", swizzle); + } + + template + std::string WorkGroupId(Operation) { + return fmt::format("invocation.groupid.{}", swizzle); + } + + template + std::string ThreadMask(Operation) { + return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2); + } + + template + void AddExpression(std::string_view text, Args&&... args) { + shader_source += fmt::format(text, std::forward(args)...); + } + + template + void AddLine(std::string_view text, Args&&... 
args) { + AddExpression(text, std::forward(args)...); + shader_source += '\n'; + } + + std::string AllocTemporary() { + max_temporaries = std::max(max_temporaries, num_temporaries + 1); + return fmt::format("T{}.x", num_temporaries++); + } + + std::string AllocVectorTemporary() { + max_temporaries = std::max(max_temporaries, num_temporaries + 1); + return fmt::format("T{}", num_temporaries++); + } + + void ResetTemporaries() noexcept { + num_temporaries = 0; + } + + const Device& device; + const ShaderIR& ir; + const Registry& registry; + const ShaderType stage; + + std::size_t num_temporaries = 0; + std::size_t max_temporaries = 0; + + std::string shader_source; + + static constexpr std::string_view ADD_F32 = "ADD.F32"; + static constexpr std::string_view ADD_S = "ADD.S"; + static constexpr std::string_view ADD_U = "ADD.U"; + static constexpr std::string_view MUL_F32 = "MUL.F32"; + static constexpr std::string_view MUL_S = "MUL.S"; + static constexpr std::string_view MUL_U = "MUL.U"; + static constexpr std::string_view DIV_F32 = "DIV.F32"; + static constexpr std::string_view DIV_S = "DIV.S"; + static constexpr std::string_view DIV_U = "DIV.U"; + static constexpr std::string_view MAD_F32 = "MAD.F32"; + static constexpr std::string_view RSQ_F32 = "RSQ.F32"; + static constexpr std::string_view COS_F32 = "COS.F32"; + static constexpr std::string_view SIN_F32 = "SIN.F32"; + static constexpr std::string_view EX2_F32 = "EX2.F32"; + static constexpr std::string_view LG2_F32 = "LG2.F32"; + static constexpr std::string_view SLT_F = "SLT.F32"; + static constexpr std::string_view SLT_S = "SLT.S"; + static constexpr std::string_view SLT_U = "SLT.U"; + static constexpr std::string_view SEQ_F = "SEQ.F32"; + static constexpr std::string_view SEQ_S = "SEQ.S"; + static constexpr std::string_view SEQ_U = "SEQ.U"; + static constexpr std::string_view SLE_F = "SLE.F32"; + static constexpr std::string_view SLE_S = "SLE.S"; + static constexpr std::string_view SLE_U = "SLE.U"; + static constexpr std::string_view SGT_F = "SGT.F32"; + static constexpr std::string_view SGT_S = "SGT.S"; + static constexpr std::string_view SGT_U = "SGT.U"; + static constexpr std::string_view SNE_F = "SNE.F32"; + static constexpr std::string_view SNE_S = "SNE.S"; + static constexpr std::string_view SNE_U = "SNE.U"; + static constexpr std::string_view SGE_F = "SGE.F32"; + static constexpr std::string_view SGE_S = "SGE.S"; + static constexpr std::string_view SGE_U = "SGE.U"; + static constexpr std::string_view AND_S = "AND.S"; + static constexpr std::string_view AND_U = "AND.U"; + static constexpr std::string_view TRUNC_F = "TRUNC.F"; + static constexpr std::string_view TRUNC_S = "TRUNC.S"; + static constexpr std::string_view TRUNC_U = "TRUNC.U"; + static constexpr std::string_view SHL_S = "SHL.S"; + static constexpr std::string_view SHL_U = "SHL.U"; + static constexpr std::string_view SHR_S = "SHR.S"; + static constexpr std::string_view SHR_U = "SHR.U"; + static constexpr std::string_view OR_S = "OR.S"; + static constexpr std::string_view OR_U = "OR.U"; + static constexpr std::string_view XOR_S = "XOR.S"; + static constexpr std::string_view XOR_U = "XOR.U"; + static constexpr std::string_view NOT_S = "NOT.S"; + static constexpr std::string_view NOT_U = "NOT.U"; + static constexpr std::string_view BTC_S = "BTC.S"; + static constexpr std::string_view BTC_U = "BTC.U"; + static constexpr std::string_view BTFM_S = "BTFM.S"; + static constexpr std::string_view BTFM_U = "BTFM.U"; + static constexpr std::string_view ROUND_F = "ROUND.F"; + 
static constexpr std::string_view CEIL_F = "CEIL.F"; + static constexpr std::string_view FLR_F = "FLR.F"; + static constexpr std::string_view I2F_S = "I2F.S"; + static constexpr std::string_view I2F_U = "I2F.U"; + static constexpr std::string_view MIN_F = "MIN.F"; + static constexpr std::string_view MIN_S = "MIN.S"; + static constexpr std::string_view MIN_U = "MIN.U"; + static constexpr std::string_view MAX_F = "MAX.F"; + static constexpr std::string_view MAX_S = "MAX.S"; + static constexpr std::string_view MAX_U = "MAX.U"; + static constexpr std::string_view MOV_U = "MOV.U"; + static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U"; + static constexpr std::string_view TGALL_U = "TGALL.U"; + static constexpr std::string_view TGANY_U = "TGANY.U"; + static constexpr std::string_view TGEQ_U = "TGEQ.U"; + static constexpr std::string_view EXCH = "EXCH"; + static constexpr std::string_view ADD = "ADD"; + static constexpr std::string_view MIN = "MIN"; + static constexpr std::string_view MAX = "MAX"; + static constexpr std::string_view AND = "AND"; + static constexpr std::string_view OR = "OR"; + static constexpr std::string_view XOR = "XOR"; + static constexpr std::string_view U32 = "U32"; + static constexpr std::string_view S32 = "S32"; + + static constexpr std::size_t NUM_ENTRIES = static_cast(OperationCode::Amount); + using DecompilerType = std::string (ARBDecompiler::*)(Operation); + static constexpr std::array OPERATION_DECOMPILERS = { + &ARBDecompiler::Assign, + + &ARBDecompiler::Select, + + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Trinary, + &ARBDecompiler::Negate<'F'>, + &ARBDecompiler::Absolute<'F'>, + &ARBDecompiler::FClamp, + &ARBDecompiler::FCastHalf0, + &ARBDecompiler::FCastHalf1, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::FSqrt, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::FSwizzleAdd, + + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Negate<'S'>, + &ARBDecompiler::Absolute<'S'>, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Unary, + &ARBDecompiler::BitfieldInsert<'S'>, + &ARBDecompiler::BitfieldExtract<'S'>, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Unary, + &ARBDecompiler::BitfieldInsert<'U'>, + &ARBDecompiler::BitfieldExtract<'U'>, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + + &ARBDecompiler::HAdd2, + &ARBDecompiler::HMul2, + &ARBDecompiler::HFma2, + &ARBDecompiler::HAbsolute, + &ARBDecompiler::HNegate, + &ARBDecompiler::HClamp, + &ARBDecompiler::HCastFloat, + &ARBDecompiler::HUnpack, + &ARBDecompiler::HMergeF32, + &ARBDecompiler::HMergeH0, + &ARBDecompiler::HMergeH1, + &ARBDecompiler::HPack2, + + 
&ARBDecompiler::LogicalAssign, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Unary, + &ARBDecompiler::LogicalPick2, + &ARBDecompiler::LogicalAnd2, + + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatOrdered, + &ARBDecompiler::FloatUnordered, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + &ARBDecompiler::FloatComparison, + + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + &ARBDecompiler::Binary, + + &ARBDecompiler::LogicalAddCarry, + + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + &ARBDecompiler::HalfComparison, + + &ARBDecompiler::Texture, + &ARBDecompiler::Texture, + &ARBDecompiler::TextureGather, + &ARBDecompiler::TextureQueryDimensions, + &ARBDecompiler::TextureQueryLod, + &ARBDecompiler::TexelFetch, + &ARBDecompiler::TextureGradient, + + &ARBDecompiler::ImageLoad, + &ARBDecompiler::ImageStore, + + &ARBDecompiler::AtomicImage, + &ARBDecompiler::AtomicImage, + &ARBDecompiler::AtomicImage, + &ARBDecompiler::AtomicImage, + &ARBDecompiler::AtomicImage, + + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + &ARBDecompiler::Atomic, + + &ARBDecompiler::Branch, + &ARBDecompiler::BranchIndirect, + &ARBDecompiler::PushFlowStack, + &ARBDecompiler::PopFlowStack, + &ARBDecompiler::Exit, + &ARBDecompiler::Discard, + + &ARBDecompiler::EmitVertex, + &ARBDecompiler::EndPrimitive, + + &ARBDecompiler::InvocationId, + &ARBDecompiler::YNegate, + &ARBDecompiler::LocalInvocationId<'x'>, + &ARBDecompiler::LocalInvocationId<'y'>, + &ARBDecompiler::LocalInvocationId<'z'>, + &ARBDecompiler::WorkGroupId<'x'>, + &ARBDecompiler::WorkGroupId<'y'>, + &ARBDecompiler::WorkGroupId<'z'>, + + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + &ARBDecompiler::Unary, + + &ARBDecompiler::ThreadId, + &ARBDecompiler::ThreadMask<'e', 'q'>, + &ARBDecompiler::ThreadMask<'g', 'e'>, + &ARBDecompiler::ThreadMask<'g', 't'>, + &ARBDecompiler::ThreadMask<'l', 'e'>, + &ARBDecompiler::ThreadMask<'l', 't'>, + &ARBDecompiler::ShuffleIndexed, + + &ARBDecompiler::Barrier, + &ARBDecompiler::MemoryBarrierGroup, + 
&ARBDecompiler::MemoryBarrierGlobal, + }; +}; + +ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier) + : device{device}, ir{ir}, registry{registry}, stage{stage} { + AddLine("TEMP RC;"); + if (ir.IsDecompiled()) { + DecompileAST(); + } else { + DecompileBranchMode(); + } + AddLine("END"); + + const std::string code = std::move(shader_source); + DeclareHeader(); + DeclareVertex(); + DeclareGeometry(); + DeclareFragment(); + DeclareCompute(); + DeclareInputAttributes(); + DeclareOutputAttributes(); + DeclareLocalMemory(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareRegisters(); + DeclareTemporaries(); + DeclarePredicates(); + DeclareInternalFlags(); + + shader_source += code; +} + +std::string_view HeaderStageName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + return "vp"; + case ShaderType::Geometry: + return "gp"; + case ShaderType::Fragment: + return "fp"; + case ShaderType::Compute: + return "cp"; + default: + UNREACHABLE(); + return ""; + } +} + +void ARBDecompiler::DeclareHeader() { + AddLine("!!NV{}5.0", HeaderStageName(stage)); + // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D + AddLine("OPTION NV_internal;"); + AddLine("OPTION NV_gpu_program_fp64;"); + AddLine("OPTION NV_shader_storage_buffer;"); + AddLine("OPTION NV_shader_thread_group;"); + if (ir.UsesWarps() && device.HasWarpIntrinsics()) { + AddLine("OPTION NV_shader_thread_shuffle;"); + } + if (stage == ShaderType::Vertex) { + if (device.HasNvViewportArray2()) { + AddLine("OPTION NV_viewport_array2;"); + } + } + if (stage == ShaderType::Fragment) { + AddLine("OPTION ARB_draw_buffers;"); + } + if (device.HasImageLoadFormatted()) { + AddLine("OPTION EXT_shader_image_load_formatted;"); + } +} + +void ARBDecompiler::DeclareVertex() { + if (stage != ShaderType::Vertex) { + return; + } + AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};"); +} + +void ARBDecompiler::DeclareGeometry() { + if (stage != ShaderType::Geometry) { + return; + } + const auto& info = registry.GetGraphicsInfo(); + const auto& header = ir.GetHeader(); + AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology)); + AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology)); + AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value()); + AddLine("ATTRIB vertex_position = vertex.position;"); +} + +void ARBDecompiler::DeclareFragment() { + if (stage != ShaderType::Fragment) { + return; + } + AddLine("OUTPUT result_color7 = result.color[7];"); + AddLine("OUTPUT result_color6 = result.color[6];"); + AddLine("OUTPUT result_color5 = result.color[5];"); + AddLine("OUTPUT result_color4 = result.color[4];"); + AddLine("OUTPUT result_color3 = result.color[3];"); + AddLine("OUTPUT result_color2 = result.color[2];"); + AddLine("OUTPUT result_color1 = result.color[1];"); + AddLine("OUTPUT result_color0 = result.color;"); +} + +void ARBDecompiler::DeclareCompute() { + if (stage != ShaderType::Compute) { + return; + } + const ComputeInfo& info = registry.GetComputeInfo(); + AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1], + info.workgroup_size[2]); + if (info.shared_memory_size_in_words > 0) { + const u32 size_in_bytes = info.shared_memory_size_in_words * 4; + AddLine("SHARED_MEMORY {};", size_in_bytes); + AddLine("SHARED shared_mem[] = {{program.sharedmem}};"); + } +} + +void ARBDecompiler::DeclareInputAttributes() { + if (stage 
== ShaderType::Compute) { + return; + } + const std::string_view stage_name = StageInputName(stage); + for (const auto attribute : ir.GetInputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + + std::string_view suffix; + if (stage == ShaderType::Fragment) { + const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)}; + if (input_mode == PixelImap::Unused) { + return; + } + suffix = GetInputFlags(input_mode); + } + AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index, + index); + } +} + +void ARBDecompiler::DeclareOutputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index); + } +} + +void ARBDecompiler::DeclareLocalMemory() { + u64 size = 0; + if (stage == ShaderType::Compute) { + size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL; + } else { + size = ir.GetHeader().GetLocalMemorySize(); + } + if (size == 0) { + return; + } + const u64 element_count = Common::AlignUp(size, 4) / 4; + AddLine("TEMP lmem[{}];", element_count); +} + +void ARBDecompiler::DeclareGlobalMemory() { + u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer; + for (const auto& pair : ir.GetGlobalMemory()) { + const auto& base = pair.first; + AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding); + ++binding; + } +} + +void ARBDecompiler::DeclareConstantBuffers() { + u32 binding = 0; + for (const auto& cbuf : ir.GetConstantBuffers()) { + AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareRegisters() { + for (const u32 gpr : ir.GetRegisters()) { + AddLine("TEMP R{};", gpr); + } +} + +void ARBDecompiler::DeclareTemporaries() { + for (std::size_t i = 0; i < max_temporaries; ++i) { + AddLine("TEMP T{};", i); + } +} + +void ARBDecompiler::DeclarePredicates() { + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("TEMP P{};", static_cast(pred)); + } +} + +void ARBDecompiler::DeclareInternalFlags() { + for (const char* name : INTERNAL_FLAG_NAMES) { + AddLine("TEMP {};", name); + } +} + +void ARBDecompiler::InitializeVariables() { + if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) { + AddLine("MOV.F result.position, {{0, 0, 0, 1}};"); + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index); + } + for (const u32 gpr : ir.GetRegisters()) { + AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr); + } + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast(pred)); + } +} + +void ARBDecompiler::DecompileAST() { + const u32 num_flow_variables = ir.GetASTNumVariables(); + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("TEMP F{};", i); + } + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i); + } + + InitializeVariables(); + + VisitAST(ir.GetASTProgram()); +} + +void ARBDecompiler::DecompileBranchMode() { + static constexpr u32 FLOW_STACK_SIZE = 20; + if (!ir.IsFlowStackDisabled()) { + AddLine("TEMP 
SSY[{}];", FLOW_STACK_SIZE); + AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE); + AddLine("TEMP SSY_TOP;"); + AddLine("TEMP PBK_TOP;"); + } + + AddLine("TEMP PC;"); + + if (!ir.IsFlowStackDisabled()) { + AddLine("MOV.U SSY_TOP.x, 0;"); + AddLine("MOV.U PBK_TOP.x, 0;"); + } + + InitializeVariables(); + + const auto basic_block_end = ir.GetBasicBlocks().end(); + auto basic_block_it = ir.GetBasicBlocks().begin(); + const u32 first_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", first_address); + + AddLine("REP;"); + + std::size_t num_blocks = 0; + while (basic_block_it != basic_block_end) { + const auto& [address, bb] = *basic_block_it; + ++num_blocks; + + AddLine("SEQ.S.CC RC.x, PC.x, {};", address); + AddLine("IF NE.x;"); + + VisitBlock(bb); + + ++basic_block_it; + + if (basic_block_it != basic_block_end) { + const auto op = std::get_if(&*bb[bb.size() - 1]); + if (!op || op->GetCode() != OperationCode::Branch) { + const u32 next_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", next_address); + AddLine("CONT;"); + } + } + + AddLine("ELSE;"); + } + AddLine("RET;"); + while (num_blocks--) { + AddLine("ENDIF;"); + } + + AddLine("ENDREP;"); +} + +void ARBDecompiler::VisitAST(const ASTNode& node) { + if (const auto ast = std::get_if(&*node->GetInnerData())) { + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + + AddLine("MOVC.U RC.x, {};", condition); + AddLine("IF NE.x;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("ENDIF;"); + } else if (const auto ast = std::get_if(&*node->GetInnerData())) { + AddLine("ELSE;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if(&*node->GetInnerData())) { + VisitBlock(ast->nodes); + } else if (const auto ast = std::get_if(&*node->GetInnerData())) { + AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); + ResetTemporaries(); + } else if (const auto ast = std::get_if(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + AddLine("REP;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("MOVC.U RC.x, {};", condition); + AddLine("BRK (NE.x);"); + AddLine("ENDREP;"); + } else if (const auto ast = std::get_if(&*node->GetInnerData())) { + const bool is_true = ExprIsTrue(ast->condition); + if (!is_true) { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("IF NE.x;"); + ResetTemporaries(); + } + if (ast->kills) { + AddLine("KIL TR;"); + } else { + Exit(); + } + if (!is_true) { + AddLine("ENDIF;"); + } + } else if (const auto ast = std::get_if(&*node->GetInnerData())) { + if (ExprIsTrue(ast->condition)) { + AddLine("BRK;"); + } else { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("BRK (NE.x);"); + ResetTemporaries(); + } + } else if (std::holds_alternative(*node->GetInnerData())) { + // Nothing to do + } else { + UNREACHABLE(); + } +} + +std::string ARBDecompiler::VisitExpression(const Expr& node) { + const std::string result = AllocTemporary(); + if (const auto expr = std::get_if(&*node)) { + AddLine("AND.U {}, {}, {};", result, 
VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if(&*node)) { + const std::string result = AllocTemporary(); + AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if(&*node)) { + const std::string result = AllocTemporary(); + AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1)); + return result; + } + if (const auto expr = std::get_if(&*node)) { + return fmt::format("P{}.x", static_cast(expr->predicate)); + } + if (const auto expr = std::get_if(&*node)) { + return Visit(ir.GetConditionCode(expr->cc)); + } + if (const auto expr = std::get_if(&*node)) { + return fmt::format("F{}.x", expr->var_index); + } + if (const auto expr = std::get_if(&*node)) { + return expr->value ? "0xffffffff" : "0"; + } + if (const auto expr = std::get_if(&*node)) { + const std::string result = AllocTemporary(); + AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value); + return result; + } + UNREACHABLE(); + return "0"; +} + +void ARBDecompiler::VisitBlock(const NodeBlock& bb) { + for (const auto& node : bb) { + Visit(node); + } +} + +std::string ARBDecompiler::Visit(const Node& node) { + if (const auto operation = std::get_if(&*node)) { + if (const auto amend_index = operation->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + const std::size_t index = static_cast(operation->GetCode()); + if (index >= OPERATION_DECOMPILERS.size()) { + UNREACHABLE_MSG("Out of bounds operation: {}", index); + return {}; + } + const auto decompiler = OPERATION_DECOMPILERS[index]; + if (decompiler == nullptr) { + UNREACHABLE_MSG("Undefined operation: {}", index); + return {}; + } + return (this->*decompiler)(*operation); + } + + if (const auto gpr = std::get_if(&*node)) { + const u32 index = gpr->GetIndex(); + if (index == Register::ZeroIndex) { + return "{0, 0, 0, 0}.x"; + } + return fmt::format("R{}.x", index); + } + + if (const auto cv = std::get_if(&*node)) { + return fmt::format("CV{}.x", cv->GetIndex()); + } + + if (const auto immediate = std::get_if(&*node)) { + const std::string temporary = AllocTemporary(); + AddLine("MOV.U {}, {};", temporary, immediate->GetValue()); + return temporary; + } + + if (const auto predicate = std::get_if(&*node)) { + const std::string temporary = AllocTemporary(); + switch (const auto index = predicate->GetIndex(); index) { + case Tegra::Shader::Pred::UnusedIndex: + AddLine("MOV.S {}, -1;", temporary); + break; + case Tegra::Shader::Pred::NeverExecute: + AddLine("MOV.S {}, 0;", temporary); + break; + default: + AddLine("MOV.S {}, P{}.x;", temporary, static_cast(index)); + break; + } + if (predicate->IsNegated()) { + AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary); + } + return temporary; + } + + if (const auto abuf = std::get_if(&*node)) { + if (abuf->IsPhysicalBuffer()) { + UNIMPLEMENTED_MSG("Physical buffers are not implemented"); + return "{0, 0, 0, 0}.x"; + } + + const auto buffer_index = [this, &abuf]() -> std::string { + if (stage != ShaderType::Geometry) { + return ""; + } + return fmt::format("[{}]", Visit(abuf->GetBuffer())); + }; + + const Attribute::Index index = abuf->GetIndex(); + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (index) { + case Attribute::Index::Position: { + if (stage == ShaderType::Geometry) { + return fmt::format("{}_position[{}].{}", StageInputName(stage), + Visit(abuf->GetBuffer()), swizzle); 
+ } else { + return fmt::format("{}.position.{}", StageInputName(stage), swizzle); + } + } + case Attribute::Index::TessCoordInstanceIDVertexID: + ASSERT(stage == ShaderType::Vertex); + switch (element) { + case 2: + return "vertex.instance"; + case 3: + return "vertex.id"; + } + UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); + break; + case Attribute::Index::PointCoord: + switch (element) { + case 0: + return "fragment.pointcoord.x"; + case 1: + return "fragment.pointcoord.y"; + } + UNIMPLEMENTED(); + break; + case Attribute::Index::FrontFacing: { + ASSERT(stage == ShaderType::Fragment); + ASSERT(element == 3); + const std::string temporary = AllocVectorTemporary(); + AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};"); + AddLine("MOV.U.CC RC.x, -RC;"); + AddLine("MOV.S {}.x, 0;", temporary); + AddLine("MOV.S {}.x (NE.x), -1;", temporary); + return fmt::format("{}.x", temporary); + } + default: + if (IsGenericAttribute(index)) { + if (stage == ShaderType::Geometry) { + return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.attrib[{}].{}", StageInputName(stage), + GetGenericAttributeIndex(index), swizzle); + } + } + UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast(index)); + break; + } + return "{0, 0, 0, 0}.x"; + } + + if (const auto cbuf = std::get_if(&*node)) { + std::string offset_string; + const auto& offset = cbuf->GetOffset(); + if (const auto imm = std::get_if(&*offset)) { + offset_string = std::to_string(imm->GetValue()); + } else { + offset_string = Visit(offset); + } + const std::string temporary = AllocTemporary(); + AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string); + return temporary; + } + + if (const auto gmem = std::get_if(&*node)) { + const std::string temporary = AllocTemporary(); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), + Visit(gmem->GetBaseAddress())); + AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()), + temporary); + return temporary; + } + + if (const auto lmem = std::get_if(&*node)) { + const std::string temporary = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", temporary, temporary); + AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary); + return temporary; + } + + if (const auto smem = std::get_if(&*node)) { + const std::string temporary = Visit(smem->GetAddress()); + AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary); + return temporary; + } + + if (const auto internal_flag = std::get_if(&*node)) { + const std::size_t index = static_cast(internal_flag->GetFlag()); + return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } + + if (const auto conditional = std::get_if(&*node)) { + if (const auto amend_index = conditional->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition())); + AddLine("IF NE.x;"); + VisitBlock(conditional->GetCode()); + AddLine("ENDIF;"); + return {}; + } + + if (const auto cmt = std::get_if(&*node)) { + // Uncommenting this will generate invalid code. GLASM lacks comments. 
+ // AddLine("// {}", cmt->GetText()); + return {}; + } + + UNIMPLEMENTED(); + return {}; +} + +std::pair ARBDecompiler::BuildCoords(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + UNIMPLEMENTED_IF(meta.sampler.is_indexed); + UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube); + + const std::size_t count = operation.GetOperandsCount(); + std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + if (meta.sampler.is_array) { + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array)); + } + if (meta.sampler.is_shadow) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare)); + } + return {std::move(temporary), i}; +} + +std::string ARBDecompiler::BuildAoffi(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + if (meta.aoffi.empty()) { + return {}; + } + const std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (auto& node : meta.aoffi) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node)); + } + return fmt::format(", offset({})", temporary); +} + +void ARBDecompiler::Exit() { + if (stage != ShaderType::Fragment) { + AddLine("RET;"); + return; + } + + const auto safe_get_register = [this](u32 reg) -> std::string { + // TODO(Rodrigo): Replace with contains once C++20 releases + const auto& used_registers = ir.GetRegisters(); + if (used_registers.find(reg) != used_registers.end()) { + return fmt::format("R{}.x", reg); + } + return "{0, 0, 0, 0}.x"; + }; + + const auto& header = ir.GetHeader(); + u32 current_reg = 0; + for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) { + for (u32 component = 0; component < 4; ++component) { + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; + } + AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component), + safe_get_register(current_reg)); + ++current_reg; + } + } + if (header.ps.omap.depth) { + AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1)); + } + + AddLine("RET;"); +} + +std::string ARBDecompiler::Assign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string dest_name; + if (const auto gpr = std::get_if(&*dest)) { + if (gpr->GetIndex() == Register::ZeroIndex) { + // Writing to Register::ZeroIndex is a no op + return {}; + } + dest_name = fmt::format("R{}.x", gpr->GetIndex()); + } else if (const auto abuf = std::get_if(&*dest)) { + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (const Attribute::Index index = abuf->GetIndex()) { + case Attribute::Index::Position: + dest_name = fmt::format("result.position.{}", swizzle); + break; + case Attribute::Index::LayerViewportPointSize: + switch (element) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + case 2: + if (!device.HasNvViewportArray2()) { + LOG_ERROR( + Render_OpenGL, + "NV_viewport_array2 is missing. Maxwell gen 2 or better is required."); + return {}; + } + dest_name = element == 1 ? 
"result.layer.x" : "result.viewport.x"; + break; + case 3: + dest_name = "result.pointsize.x"; + break; + } + break; + case Attribute::Index::ClipDistances0123: + dest_name = fmt::format("result.clip[{}].x", element); + break; + case Attribute::Index::ClipDistances4567: + dest_name = fmt::format("result.clip[{}].x", element + 4); + break; + default: + if (!IsGenericAttribute(index)) { + UNREACHABLE(); + return {}; + } + dest_name = + fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle); + break; + } + } else if (const auto lmem = std::get_if(&*dest)) { + const std::string address = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", address, address); + dest_name = fmt::format("lmem[{}].x", address); + } else if (const auto smem = std::get_if(&*dest)) { + AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress())); + ResetTemporaries(); + return {}; + } else if (const auto gmem = std::get_if(&*dest)) { + const std::string temporary = AllocTemporary(); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), + Visit(gmem->GetBaseAddress())); + AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()), + temporary); + ResetTemporaries(); + return {}; + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", dest_name, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::Select(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]), + Visit(operation[2])); + return temporary; +} + +std::string ARBDecompiler::FClamp(Operation operation) { + // 1.0f in hex, replace with std::bit_cast on C++20 + static constexpr u32 POSITIVE_ONE = 0x3f800000; + + const std::string temporary = AllocTemporary(); + const Node& value = operation[0]; + const Node& low = operation[1]; + const Node& high = operation[2]; + const auto imm_low = std::get_if(&*low); + const auto imm_high = std::get_if(&*high); + if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) { + AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value)); + } else { + AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high)); + AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low)); + } + return temporary; +} + +std::string ARBDecompiler::FCastHalf0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FCastHalf1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.y, {};", temporary, Visit(operation[0])); + AddLine("MOV {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FSqrt(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0])); + AddLine("RCP.F32 {}, {};", temporary, temporary); + return temporary; +} + +std::string ARBDecompiler::FSwizzleAdd(Operation operation) { + LOG_WARNING(Render_OpenGL, "(STUBBED)"); + const std::string temporary = AllocTemporary(); + AddLine("ADD.F {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); + return temporary; +} + +std::string ARBDecompiler::HAdd2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = 
AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HMul2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HFma2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string tmp3 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2])); + AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HAbsolute(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, |{}|;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HNegate(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("MOVC.S RC.x, {};", Visit(operation[1])); + AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary); + AddLine("MOVC.S RC.x, {};", Visit(operation[2])); + AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HClamp(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HCastFloat(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary); + AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HUnpack(Operation operation) { + const std::string operand = Visit(operation[0]); + switch (std::get(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H0_H0: { + const std::string temporary = AllocVectorTemporary(); + 
AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H1_H1: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + } + UNREACHABLE(); + return "{0, 0, 0, 0}.x"; +} + +std::string ARBDecompiler::HMergeF32(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.x, {}.z;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.w;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HPack2(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0])); + AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::LogicalAssign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string target; + + if (const auto pred = std::get_if(&*dest)) { + ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment"); + + const Tegra::Shader::Pred index = pred->GetIndex(); + switch (index) { + case Tegra::Shader::Pred::NeverExecute: + case Tegra::Shader::Pred::UnusedIndex: + // Writing to these predicates is a no-op + return {}; + } + target = fmt::format("P{}.x", static_cast(index)); + } else if (const auto internal_flag = std::get_if(&*dest)) { + const std::size_t index = static_cast(internal_flag->GetFlag()); + target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", target, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::LogicalPick2(Operation operation) { + const std::string temporary = AllocTemporary(); + const u32 index = std::get(*operation[1]).GetValue(); + AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index)); + return temporary; +} + +std::string ARBDecompiler::LogicalAnd2(Operation operation) { + const std::string temporary = AllocTemporary(); + const std::string op = Visit(operation[0]); + AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op); + return temporary; +} + +std::string ARBDecompiler::FloatOrdered(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + 
AddLine("MOV.S {}, -1;", temporary); + AddLine("MOV.S {} (NAN.x), 0;", temporary); + AddLine("MOV.S {} (NAN.y), 0;", temporary); + return temporary; +} + +std::string ARBDecompiler::FloatUnordered(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NAN.x), -1;", temporary); + AddLine("MOV.S {} (NAN.y), -1;", temporary); + return temporary; +} + +std::string ARBDecompiler::LogicalAddCarry(Operation operation) { + const std::string temporary = AllocTemporary(); + AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("IF CF.x;"); + AddLine("MOV.S {}, -1;", temporary); + AddLine("ENDIF;"); + return temporary; +} + +std::string ARBDecompiler::Texture(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [temporary, swizzle] = BuildCoords(operation); + + std::string_view opcode = "TEX"; + std::string extra; + if (meta.bias) { + ASSERT(!meta.lod); + opcode = "TXB"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias)); + } else { + const std::string bias = AllocTemporary(); + AddLine("MOV.F {}, {};", bias, Visit(meta.bias)); + extra = fmt::format(" {},", bias); + } + } + if (meta.lod) { + ASSERT(!meta.bias); + opcode = "TXL"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } else { + const std::string lod = AllocTemporary(); + AddLine("MOV.F {}, {};", lod, Visit(meta.lod)); + extra = fmt::format(" {},", lod); + } + } + + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGather(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [temporary, swizzle] = BuildCoords(operation); + + std::string comp; + if (!meta.sampler.is_shadow) { + const auto& immediate = std::get(*meta.component); + comp = fmt::format(".{}", Swizzle(immediate.GetValue())); + } + + AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryDimensions(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::string lod = operation.GetOperandsCount() > 0 ? 
Visit(operation[0]) : "0"; + AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryLod(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::size_t count = operation.GetOperandsCount(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta)); + AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary); + AddLine("TRUNC.S {}, {};", temporary, temporary); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TexelFetch(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [temporary, swizzle] = BuildCoords(operation); + + if (!meta.sampler.is_buffer) { + ASSERT(swizzle < 4); + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta), + BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGradient(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const std::string ddx = AllocVectorTemporary(); + const std::string ddy = AllocVectorTemporary(); + const std::string coord = BuildCoords(operation).first; + + const std::size_t num_components = meta.derivates.size() / 2; + for (std::size_t index = 0; index < num_components; ++index) { + const char swizzle = Swizzle(index); + AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2])); + AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1])); + } + + const std::string_view result = coord; + AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element)); + return fmt::format("{}.x", result); +} + +std::string ARBDecompiler::ImageLoad(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t count = operation.GetOperandsCount(); + const std::string_view type = ImageType(meta.image.type); + + const std::string temporary = AllocVectorTemporary(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type); + AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::ImageStore(Operation operation) { + const auto& meta = std::get(operation.GetMeta()); + const u32 image_id = 
device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + const std::string_view type = ImageType(meta.image.type); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type); + return {}; +} + +std::string ARBDecompiler::Branch(Operation operation) { + const auto target = std::get(*operation[0]); + AddLine("MOV.U PC.x, {};", target.GetValue()); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::BranchIndirect(Operation operation) { + AddLine("MOV.U PC.x, {};", Visit(operation[0])); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::PushFlowStack(Operation operation) { + const auto stack = std::get(operation.GetMeta()); + const u32 target = std::get(*operation[0]).GetValue(); + const std::string_view stack_name = StackName(stack); + AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target); + AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + return {}; +} + +std::string ARBDecompiler::PopFlowStack(Operation operation) { + const auto stack = std::get(operation.GetMeta()); + const std::string_view stack_name = StackName(stack); + AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::Exit(Operation) { + Exit(); + return {}; +} + +std::string ARBDecompiler::Discard(Operation) { + AddLine("KIL TR;"); + return {}; +} + +std::string ARBDecompiler::EmitVertex(Operation) { + AddLine("EMIT;"); + return {}; +} + +std::string ARBDecompiler::EndPrimitive(Operation) { + AddLine("ENDPRIM;"); + return {}; +} + +std::string ARBDecompiler::InvocationId(Operation) { + return "primitive.invocation"; +} + +std::string ARBDecompiler::YNegate(Operation) { + LOG_WARNING(Render_OpenGL, "(STUBBED)"); + const std::string temporary = AllocTemporary(); + AddLine("MOV.F {}, 1;", temporary); + return temporary; +} + +std::string ARBDecompiler::ThreadId(Operation) { + return fmt::format("{}.threadid", StageInputName(stage)); +} + +std::string ARBDecompiler::ShuffleIndexed(Operation operation) { + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. 
Kepler or better is required."); + return Visit(operation[0]); + } + const std::string temporary = AllocVectorTemporary(); + AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]), + Visit(operation[1])); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::Barrier(Operation) { + if (!ir.IsDecompiled()) { + LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled"); + return {}; + } + AddLine("BAR;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGroup(Operation) { + AddLine("MEMBAR.CTA;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGlobal(Operation) { + AddLine("MEMBAR;"); + return {}; +} + +} // Anonymous namespace + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier) { + return ARBDecompiler(device, ir, registry, stage, identifier).Code(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h new file mode 100644 index 0000000000..6afc87220e --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include + +#include "common/common_types.h" + +namespace Tegra::Engines { +enum class ShaderType : u32; +} + +namespace VideoCommon::Shader { +class ShaderIR; +class Registry; +} // namespace VideoCommon::Shader + +namespace OpenGL { + +class Device; + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier); + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 890fc6c638..e245e27ecf 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -213,6 +213,7 @@ Device::Device() has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; + has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 98cca02543..145347943a 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -88,6 +88,10 @@ public: return has_fast_buffer_sub_data; } + bool HasNvViewportArray2() const { + return has_nv_viewport_array2; + } + bool UseAssemblyShaders() const { return use_assembly_shaders; } @@ -111,6 +115,7 @@ private: bool has_component_indexing_bug{}; bool has_precise_bug{}; bool has_fast_buffer_sub_data{}; + bool has_nv_viewport_array2{}; bool use_assembly_shaders{}; }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index a991ca64a7..f539a05e14 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -20,6 
+20,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" @@ -147,7 +148,8 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u auto program = std::make_shared(); if (device.UseAssemblyShaders()) { - const std::string arb = "Not implemented"; + const std::string arb = + DecompileAssemblyShader(device, ir, registry, shader_type, shader_id); GLuint& arb_prog = program->assembly_program.handle; From 87011a97f913939f8d01f9407c0f95971ee362f2 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 9 Jun 2020 03:03:41 -0300 Subject: [PATCH 060/145] gl_arb_decompiler: Implement FSwizzleAdd --- .../renderer_opengl/gl_arb_decompiler.cpp | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index 6a23221bbd..1e96b03108 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -786,6 +786,8 @@ ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Reg ShaderType stage, std::string_view identifier) : device{device}, ir{ir}, registry{registry}, stage{stage} { AddLine("TEMP RC;"); + AddLine("TEMP FSWZA[4];"); + AddLine("TEMP FSWZB[4];"); if (ir.IsDecompiled()) { DecompileAST(); } else { @@ -991,6 +993,15 @@ void ARBDecompiler::DeclareInternalFlags() { } void ARBDecompiler::InitializeVariables() { + AddLine("MOV.F32 FSWZA[0], -1;"); + AddLine("MOV.F32 FSWZA[1], 1;"); + AddLine("MOV.F32 FSWZA[2], -1;"); + AddLine("MOV.F32 FSWZA[3], 0;"); + AddLine("MOV.F32 FSWZB[0], -1;"); + AddLine("MOV.F32 FSWZB[1], -1;"); + AddLine("MOV.F32 FSWZB[2], 1;"); + AddLine("MOV.F32 FSWZB[3], -1;"); + if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) { AddLine("MOV.F result.position, {{0, 0, 0, 1}};"); } @@ -1570,10 +1581,22 @@ std::string ARBDecompiler::FSqrt(Operation operation) { } std::string ARBDecompiler::FSwizzleAdd(Operation operation) { - LOG_WARNING(Render_OpenGL, "(STUBBED)"); - const std::string temporary = AllocTemporary(); - AddLine("ADD.F {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); - return temporary; + const std::string temporary = AllocVectorTemporary(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. 
Kepler or better is required."); + AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); + return fmt::format("{}.x", temporary); + } + const std::string lut = AllocVectorTemporary(); + AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage)); + AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary); + AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary); + AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary); + AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary); + AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary); + AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary); + return fmt::format("{}.x", temporary); } std::string ARBDecompiler::HAdd2(Operation operation) { From 42250427c5ebd8591aa9fd65a98fd77501cb3008 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Sat, 13 Jun 2020 14:04:28 +1000 Subject: [PATCH 061/145] audren: Implement RendererInfo Fixes ZLA softlock --- src/audio_core/audio_renderer.cpp | 19 ++++++++++++++++--- src/audio_core/audio_renderer.h | 13 ++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/audio_core/audio_renderer.cpp b/src/audio_core/audio_renderer.cpp index 50846a8541..d644526171 100644 --- a/src/audio_core/audio_renderer.cpp +++ b/src/audio_core/audio_renderer.cpp @@ -180,11 +180,12 @@ ResultVal> AudioRenderer::UpdateAudioRenderer(const std::vector< // Copy output header UpdateDataHeader response_data{worker_params}; - std::vector output_params(response_data.total_size); if (behavior_info.IsElapsedFrameCountSupported()) { - response_data.frame_count = 0x10; - response_data.total_size += 0x10; + response_data.render_info = sizeof(RendererInfo); + response_data.total_size += sizeof(RendererInfo); } + + std::vector output_params(response_data.total_size); std::memcpy(output_params.data(), &response_data, sizeof(UpdateDataHeader)); // Copy output memory pool entries @@ -219,6 +220,17 @@ ResultVal> AudioRenderer::UpdateAudioRenderer(const std::vector< return Audren::ERR_INVALID_PARAMETERS; } + if (behavior_info.IsElapsedFrameCountSupported()) { + const std::size_t renderer_info_offset{ + sizeof(UpdateDataHeader) + response_data.memory_pools_size + response_data.voices_size + + response_data.effects_size + response_data.sinks_size + + response_data.performance_manager_size + response_data.behavior_size}; + RendererInfo renderer_info{}; + renderer_info.elasped_frame_count = elapsed_frame_count; + std::memcpy(output_params.data() + renderer_info_offset, &renderer_info, + sizeof(RendererInfo)); + } + return MakeResult(output_params); } @@ -447,6 +459,7 @@ void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) { } } audio_out->QueueBuffer(stream, tag, std::move(buffer)); + elapsed_frame_count++; } void AudioRenderer::ReleaseAndQueueBuffers() { diff --git a/src/audio_core/audio_renderer.h b/src/audio_core/audio_renderer.h index 1f9114c075..f0b691a865 100644 --- a/src/audio_core/audio_renderer.h +++ b/src/audio_core/audio_renderer.h @@ -196,6 +196,12 @@ struct EffectOutStatus { }; static_assert(sizeof(EffectOutStatus) == 0x10, "EffectOutStatus is an invalid size"); +struct RendererInfo { + u64_le elasped_frame_count{}; + INSERT_PADDING_WORDS(2); +}; +static_assert(sizeof(RendererInfo) == 0x10, "RendererInfo is an invalid size"); + struct UpdateDataHeader { UpdateDataHeader() {} @@ -209,7 +215,7 @@ struct UpdateDataHeader { mixes_size = 0x0; sinks_size = config.sink_count * 0x20; 
performance_manager_size = 0x10; - frame_count = 0; + render_info = 0; total_size = sizeof(UpdateDataHeader) + behavior_size + memory_pools_size + voices_size + effects_size + sinks_size + performance_manager_size; } @@ -223,8 +229,8 @@ struct UpdateDataHeader { u32_le mixes_size{}; u32_le sinks_size{}; u32_le performance_manager_size{}; - INSERT_PADDING_WORDS(1); - u32_le frame_count{}; + u32_le splitter_size{}; + u32_le render_info{}; INSERT_PADDING_WORDS(4); u32_le total_size{}; }; @@ -258,6 +264,7 @@ private: std::unique_ptr audio_out; StreamPtr stream; Core::Memory::Memory& memory; + std::size_t elapsed_frame_count{}; }; } // namespace AudioCore From 6e5d8aac4d8e573dc12a9bc8d93621f17323df5e Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 15 Jun 2020 05:16:54 -0300 Subject: [PATCH 062/145] video_core/macro_jit_x64: Remove initializer in member variable Fix build time issues on gcc. Confirmed through asan that avoiding this initialization is safe. --- src/video_core/macro/macro_jit_x64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 21ee157cf4..71f738b9a8 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -85,8 +85,8 @@ private: std::optional next_opcode{}; ProgramType program{nullptr}; - std::array labels{}; - std::array delay_skip{}; + std::array labels; + std::array delay_skip; Xbyak::Label end_of_code{}; bool is_delay_slot{}; From 4417770ba9a1d48ded255e75c32dcc1005b912c1 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 18:59:01 +0100 Subject: [PATCH 063/145] xbyak_abi: Fix ABI_PushRegistersAndAdjustStack Pushing GPRs twice. --- src/common/x64/xbyak_abi.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h index 794da8a52f..d15e1aaf0e 100644 --- a/src/common/x64/xbyak_abi.h +++ b/src/common/x64/xbyak_abi.h @@ -178,21 +178,17 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b size_t rsp_alignment, size_t needed_frame_size = 0) { s32 subtraction, xmm_offset; ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset); + for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_GPRS[i]) { code.push(IndexToReg64(static_cast(i))); } } + if (subtraction != 0) { code.sub(code.rsp, subtraction); } - for (int i = 0; i < regs.count(); i++) { - if (regs.test(i) & ABI_ALL_GPRS.test(i)) { - code.push(IndexToReg64(i)); - } - } - for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_XMMS[i]) { code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast(i))); From d563017dfe63aaa26e7c08369995838f8b9fdafb Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 18:59:54 +0100 Subject: [PATCH 064/145] xbyak_abi: Remove *GPS variants of stack manipulation functions --- src/common/x64/xbyak_abi.h | 36 -------------------------- src/video_core/macro/macro_jit_x64.cpp | 12 ++++----- 2 files changed, 6 insertions(+), 42 deletions(-) diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h index d15e1aaf0e..1dcd147b6d 100644 --- a/src/common/x64/xbyak_abi.h +++ b/src/common/x64/xbyak_abi.h @@ -223,40 +223,4 @@ inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bits } } -inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs, - size_t rsp_alignment, - size_t needed_frame_size 
= 0) { - s32 subtraction, xmm_offset; - ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset); - - for (std::size_t i = 0; i < regs.size(); ++i) { - if (regs[i] && ABI_ALL_GPRS[i]) { - code.push(IndexToReg64(static_cast(i))); - } - } - - if (subtraction != 0) { - code.sub(code.rsp, subtraction); - } - - return ABI_SHADOW_SPACE; -} - -inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs, - size_t rsp_alignment, size_t needed_frame_size = 0) { - s32 subtraction, xmm_offset; - ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset); - - if (subtraction != 0) { - code.add(code.rsp, subtraction); - } - - // GPRs need to be popped in reverse order - for (int i = 15; i >= 0; i--) { - if (regs[i]) { - code.pop(IndexToReg64(i)); - } - } -} - } // namespace Common::X64 diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 11c1cc3be8..2d82c8cff5 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -302,22 +302,22 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { sub(result, opcode.immediate * -1); } } - Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); mov(Common::X64::ABI_PARAM1, qword[STATE]); mov(Common::X64::ABI_PARAM2, RESULT); Common::X64::CallFarFunction(*this, &Read); - Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); mov(RESULT, Common::X64::ABI_RETURN.cvt32()); Compile_ProcessResult(opcode.result_operation, opcode.dst); } void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { - Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); mov(Common::X64::ABI_PARAM1, qword[STATE]); mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); mov(Common::X64::ABI_PARAM3, value); Common::X64::CallFarFunction(*this, &Send); - Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); Xbyak::Label dont_process{}; // Get increment @@ -421,7 +421,7 @@ void MacroJITx64Impl::Compile() { bool keep_executing = true; labels.fill(Xbyak::Label()); - Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); // JIT state mov(STATE, Common::X64::ABI_PARAM1); mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + @@ -463,7 +463,7 @@ void MacroJITx64Impl::Compile() { L(end_of_code); - Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); ret(); ready(); program = getCode(); From 36362e9695a988e4727adcfb52f265394d55e8d5 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 19:02:06 +0100 Subject: [PATCH 065/145] xbyak_abi: Register indexes should be unsigned --- src/common/x64/xbyak_abi.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h index 1dcd147b6d..33a96d6cbf 100644 --- 
a/src/common/x64/xbyak_abi.h +++ b/src/common/x64/xbyak_abi.h @@ -11,7 +11,7 @@ namespace Common::X64 { -inline int RegToIndex(const Xbyak::Reg& reg) { +inline std::size_t RegToIndex(const Xbyak::Reg& reg) { using Kind = Xbyak::Reg::Kind; ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0, "RegSet only support GPRs and XMM registers."); @@ -19,17 +19,17 @@ inline int RegToIndex(const Xbyak::Reg& reg) { return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16); } -inline Xbyak::Reg64 IndexToReg64(int reg_index) { +inline Xbyak::Reg64 IndexToReg64(std::size_t reg_index) { ASSERT(reg_index < 16); - return Xbyak::Reg64(reg_index); + return Xbyak::Reg64(static_cast(reg_index)); } -inline Xbyak::Xmm IndexToXmm(int reg_index) { +inline Xbyak::Xmm IndexToXmm(std::size_t reg_index) { ASSERT(reg_index >= 16 && reg_index < 32); - return Xbyak::Xmm(reg_index - 16); + return Xbyak::Xmm(static_cast(reg_index - 16)); } -inline Xbyak::Reg IndexToReg(int reg_index) { +inline Xbyak::Reg IndexToReg(std::size_t reg_index) { if (reg_index < 16) { return IndexToReg64(reg_index); } else { @@ -181,7 +181,7 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_GPRS[i]) { - code.push(IndexToReg64(static_cast(i))); + code.push(IndexToReg64(i)); } } @@ -191,7 +191,7 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_XMMS[i]) { - code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast(i))); + code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(i)); xmm_offset += 0x10; } } @@ -206,7 +206,7 @@ inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bits for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_XMMS[i]) { - code.movaps(IndexToXmm(static_cast(i)), code.xword[code.rsp + xmm_offset]); + code.movaps(IndexToXmm(i), code.xword[code.rsp + xmm_offset]); xmm_offset += 0x10; } } @@ -216,8 +216,9 @@ inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bits } // GPRs need to be popped in reverse order - for (int i = 15; i >= 0; i--) { - if (regs[i]) { + for (std::size_t j = 0; j < regs.size(); ++j) { + const std::size_t i = regs.size() - j - 1; + if (regs[i] && ABI_ALL_GPRS[i]) { code.pop(IndexToReg64(i)); } } From 7c6203dc5ed9a8b5ae9154602d27717569851571 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 19:04:29 +0100 Subject: [PATCH 066/145] xbyak_abi: Prefer returning a struct to using out parameters in ABI_CalculateFrameSize --- src/common/x64/xbyak_abi.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h index 33a96d6cbf..a5f5d4fc10 100644 --- a/src/common/x64/xbyak_abi.h +++ b/src/common/x64/xbyak_abi.h @@ -151,9 +151,13 @@ constexpr size_t ABI_SHADOW_SPACE = 0; #endif -inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment, - size_t needed_frame_size, s32* out_subtraction, - s32* out_xmm_offset) { +struct ABIFrameInfo { + s32 subtraction; + s32 xmm_offset; +}; + +inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment, + size_t needed_frame_size) { const auto count = (regs & ABI_ALL_GPRS).count(); rsp_alignment -= count * 8; size_t subtraction = 0; @@ -170,14 +174,13 @@ inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment, 
rsp_alignment -= subtraction; subtraction += rsp_alignment & 0xF; - *out_subtraction = (s32)subtraction; - *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction); + return ABIFrameInfo{static_cast(subtraction), + static_cast(subtraction - xmm_base_subtraction)}; } inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs, size_t rsp_alignment, size_t needed_frame_size = 0) { - s32 subtraction, xmm_offset; - ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset); + auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size); for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_GPRS[i]) { @@ -185,14 +188,14 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b } } - if (subtraction != 0) { - code.sub(code.rsp, subtraction); + if (frame_info.subtraction != 0) { + code.sub(code.rsp, frame_info.subtraction); } for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_XMMS[i]) { - code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(i)); - xmm_offset += 0x10; + code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i)); + frame_info.xmm_offset += 0x10; } } @@ -201,18 +204,17 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs, size_t rsp_alignment, size_t needed_frame_size = 0) { - s32 subtraction, xmm_offset; - ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset); + auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size); for (std::size_t i = 0; i < regs.size(); ++i) { if (regs[i] && ABI_ALL_XMMS[i]) { - code.movaps(IndexToXmm(i), code.xword[code.rsp + xmm_offset]); - xmm_offset += 0x10; + code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]); + frame_info.xmm_offset += 0x10; } } - if (subtraction != 0) { - code.add(code.rsp, subtraction); + if (frame_info.subtraction != 0) { + code.add(code.rsp, frame_info.subtraction); } // GPRs need to be popped in reverse order From a6a43a5ae047404ca0b03aa647ed5b17400ca7b6 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 20:28:30 +0100 Subject: [PATCH 067/145] macro_jit_x64: Remove RESULT_64 This Reg64 codepath has the exact same behaviour as the Reg32 one. 
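For reference, a standalone sketch of why the 32-bit path is enough (illustrative C++ only, not part of the patched code): macro registers hold u32 values, and on x86-64 writing a 32-bit register already zero-extends into the full 64-bit register, so the Reg32 overload covers every place the Reg64 overload was used.

    #include <cstdint>

    // Hypothetical helper mirroring Compile_GetRegister's semantics in plain C++.
    std::uint64_t LoadMacroRegister(const std::uint32_t* registers, std::uint32_t index) {
        // Register 0 is hard-wired to zero, like the xor_(dst, dst) fast path.
        const std::uint32_t value = index == 0 ? 0u : registers[index];
        return value; // implicit zero-extension, same as a 32-bit mov feeding a 64-bit use
    }
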
--- src/video_core/macro/macro_jit_x64.cpp | 18 +++--------------- src/video_core/macro/macro_jit_x64.h | 1 - 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 11c1cc3be8..9a9d508664 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -19,7 +19,6 @@ static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; static const Xbyak::Reg64 STATE = Xbyak::util::r11; static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; -static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13; static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; @@ -64,15 +63,15 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { const bool is_move_operation = !is_a_zero && is_b_zero; const bool has_zero_register = is_a_zero || is_b_zero; - Xbyak::Reg64 src_a; + Xbyak::Reg32 src_a; Xbyak::Reg32 src_b; if (!optimizer.zero_reg_skip) { - src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + src_a = Compile_GetRegister(opcode.src_a, RESULT); src_b = Compile_GetRegister(opcode.src_b, ebx); } else { if (!is_a_zero) { - src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + src_a = Compile_GetRegister(opcode.src_a, RESULT); } if (!is_b_zero) { src_b = Compile_GetRegister(opcode.src_b, ebx); @@ -553,17 +552,6 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { return dst; } -Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) { - if (index == 0) { - // Register 0 is always zero - xor_(dst, dst); - } else { - mov(dst, dword[REGISTERS + index * sizeof(u32)]); - } - - return dst; -} - void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { Xbyak::Label zero{}, end{}; xor_(ecx, ecx); diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 21ee157cf4..377368086e 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -55,7 +55,6 @@ private: Xbyak::Reg32 Compile_FetchParameter(); Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); - Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst); void Compile_WriteCarry(Xbyak::Reg64 dst); void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); From 389549b80d7cd7054ec622f4038ff599386e1c04 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 20:51:33 +0100 Subject: [PATCH 068/145] macro_jit_x64: Remove METHOD_ADDRESS_64 Unnecessary variable. 
--- src/video_core/macro/macro_jit_x64.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 9a9d508664..1dcf9957ce 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -20,7 +20,6 @@ static const Xbyak::Reg64 STATE = Xbyak::util::r11; static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; -static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ @@ -328,7 +327,7 @@ void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { and_(METHOD_ADDRESS, 0xfff); shr(ecx, 12); and_(ecx, 0x3f); - lea(eax, ptr[rcx + METHOD_ADDRESS_64]); + lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); sal(ecx, 12); or_(eax, ecx); From 35db6e1c68f18f401bcae8bd8e8937648c7c67c6 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 20:55:02 +0100 Subject: [PATCH 069/145] macro_jit_x64: Remove JITState::parameters This can be passed in as an argument instead. --- src/video_core/macro/macro_jit_x64.cpp | 6 ++---- src/video_core/macro/macro_jit_x64.h | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 1dcf9957ce..f1d123f51b 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -51,8 +51,7 @@ void MacroJITx64Impl::Execute(const std::vector& parameters, u32 method) { JITState state{}; state.maxwell3d = &maxwell3d; state.registers = {}; - state.parameters = parameters.data(); - program(&state); + program(&state, parameters.data()); } void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { @@ -422,8 +421,7 @@ void MacroJITx64Impl::Compile() { Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); // JIT state mov(STATE, Common::X64::ABI_PARAM1); - mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + - static_cast(offsetof(JITState, parameters))]); + mov(PARAMETERS, Common::X64::ABI_PARAM2); mov(REGISTERS, Common::X64::ABI_PARAM1); add(REGISTERS, static_cast(offsetof(JITState, registers))); xor_(RESULT, RESULT); diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 377368086e..9167b2a93a 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -66,11 +66,10 @@ private: struct JITState { Engines::Maxwell3D* maxwell3d{}; std::array registers{}; - const u32* parameters{}; u32 carry_flag{}; }; static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); - using ProgramType = void (*)(JITState*); + using ProgramType = void (*)(JITState*, const u32*); struct OptimizerState { bool can_skip_carry{}; From 79aa7b3aceeecadfb5b15bc25431db7768434f23 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 21:00:59 +0100 Subject: [PATCH 070/145] macro_jit_x64: Remove REGISTERS Unnecessary since this is just an offset from STATE. 
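A minimal sketch of the equivalent addressing in plain C++ (struct layout and names here are illustrative, not the actual JITState definition): the register file sits at a fixed offset inside JITState, so it can be reached through the STATE base pointer with offsetof instead of keeping a second base pointer live in a host register.

    #include <cstddef>
    #include <cstdint>

    struct State {                    // stand-in for JITState
        void* engine;                 // stand-in for the maxwell3d pointer at offset 0
        std::uint32_t registers[8];
    };

    std::uint32_t ReadRegister(const State* state, std::size_t index) {
        // Same shape as the emitted load:
        //   mov eax, dword [STATE + offsetof(JITState, registers) + index * sizeof(u32)]
        const auto* base = reinterpret_cast<const unsigned char*>(state);
        return *reinterpret_cast<const std::uint32_t*>(
            base + offsetof(State, registers) + index * sizeof(std::uint32_t));
    }
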
--- src/video_core/macro/macro_jit_x64.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index f1d123f51b..da3b86d3d2 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -15,7 +15,6 @@ MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255 namespace Tegra { static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; -static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; static const Xbyak::Reg64 STATE = Xbyak::util::r11; static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; @@ -24,7 +23,6 @@ static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ PARAMETERS, - REGISTERS, STATE, NEXT_PARAMETER, RESULT, @@ -422,14 +420,12 @@ void MacroJITx64Impl::Compile() { // JIT state mov(STATE, Common::X64::ABI_PARAM1); mov(PARAMETERS, Common::X64::ABI_PARAM2); - mov(REGISTERS, Common::X64::ABI_PARAM1); - add(REGISTERS, static_cast(offsetof(JITState, registers))); xor_(RESULT, RESULT); xor_(METHOD_ADDRESS, METHOD_ADDRESS); xor_(NEXT_PARAMETER, NEXT_PARAMETER); xor_(BRANCH_HOLDER, BRANCH_HOLDER); - mov(dword[REGISTERS + 4], Compile_FetchParameter()); + mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); // Track get register for zero registers and mark it as no-op optimizer.zero_reg_skip = true; @@ -543,7 +539,7 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { // Register 0 is always zero xor_(dst, dst); } else { - mov(dst, dword[REGISTERS + index * sizeof(u32)]); + mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]); } return dst; @@ -564,7 +560,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3 if (reg == 0) { return; } - mov(dword[REGISTERS + reg * sizeof(u32)], result); + mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result); }; auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; From c09a9e5cc7f53280218cdfbfd7d7ff056f1c2ff5 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 21:12:53 +0100 Subject: [PATCH 071/145] macro_jit_x64: Select better registers All registers are now callee-save registers. RBX and RBP selected for STATE and RESULT because these are most commonly accessed; this is to avoid the REX prefix. RBP not used for STATE because there are some SIB restrictions, RBX emits smaller code. 
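For reference, a short standalone demo of the encoding differences behind this choice (the class, registers and include path below are illustrative, not part of the patch):

    #include <xbyak.h> // xbyak single-header; adjust the path to wherever xbyak lives

    struct EncodingDemo : Xbyak::CodeGenerator {
        EncodingDemo() {
            using namespace Xbyak::util;
            mov(ebx, eax);      // 2 bytes: legacy low registers need no REX prefix
            mov(r13d, eax);     // 3 bytes: any of r8d-r15d forces a REX prefix
            mov(eax, ptr[rbx]); // 8B 03:    [rbx] encodes with no displacement byte
            mov(eax, ptr[rbp]); // 8B 45 00: [rbp] cannot be encoded without a displacement,
                                //           so a zero disp8 is emitted (the SIB restriction above)
        }
    };

    int main() {
        EncodingDemo demo;
        return demo.getSize() > 0 ? 0 : 1;
    }
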
--- src/video_core/macro/macro_jit_x64.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index da3b86d3d2..1e7b05ac95 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -14,18 +14,18 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255 MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); namespace Tegra { -static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; -static const Xbyak::Reg64 STATE = Xbyak::util::r11; -static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; -static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; +static const Xbyak::Reg64 STATE = Xbyak::util::rbx; +static const Xbyak::Reg32 RESULT = Xbyak::util::ebp; +static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; +static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r13; static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ - PARAMETERS, STATE, - NEXT_PARAMETER, RESULT, + PARAMETERS, + NEXT_PARAMETER, METHOD_ADDRESS, BRANCH_HOLDER, }); @@ -64,13 +64,13 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { if (!optimizer.zero_reg_skip) { src_a = Compile_GetRegister(opcode.src_a, RESULT); - src_b = Compile_GetRegister(opcode.src_b, ebx); + src_b = Compile_GetRegister(opcode.src_b, eax); } else { if (!is_a_zero) { src_a = Compile_GetRegister(opcode.src_a, RESULT); } if (!is_b_zero) { - src_b = Compile_GetRegister(opcode.src_b, ebx); + src_b = Compile_GetRegister(opcode.src_b, eax); } } Xbyak::Label skip_carry{}; From 1799f4e7743557c8e41c15201c42431f8d6d6dde Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 21:14:10 +0100 Subject: [PATCH 072/145] macro_jit_x64: Remove unused function Compile_WriteCarry --- src/video_core/macro/macro_jit_x64.cpp | 8 -------- src/video_core/macro/macro_jit_x64.h | 1 - 2 files changed, 9 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 1e7b05ac95..b703daad9e 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -545,14 +545,6 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { return dst; } -void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { - Xbyak::Label zero{}, end{}; - xor_(ecx, ecx); - shr(dst, 32); - setne(cl); - mov(dword[STATE + offsetof(JITState, carry_flag)], ecx); -} - void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { // Register 0 is supposed to always return 0. 
NOP is implemented as a store to the zero diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 9167b2a93a..a05d8df15a 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -55,7 +55,6 @@ private: Xbyak::Reg32 Compile_FetchParameter(); Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); - void Compile_WriteCarry(Xbyak::Reg64 dst); void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); void Compile_Send(Xbyak::Reg32 value); From cf0aad7d6a22024362c7adf04b605108141453f6 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 21:16:47 +0100 Subject: [PATCH 073/145] macro_jit_x64: Remove NEXT_PARAMETER Not required, as PARAMETERS can just be incremented directly. --- src/video_core/macro/macro_jit_x64.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index b703daad9e..2eb98173df 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -17,7 +17,6 @@ namespace Tegra { static const Xbyak::Reg64 STATE = Xbyak::util::rbx; static const Xbyak::Reg32 RESULT = Xbyak::util::ebp; static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; -static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r13; static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; @@ -25,7 +24,6 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ STATE, RESULT, PARAMETERS, - NEXT_PARAMETER, METHOD_ADDRESS, BRANCH_HOLDER, }); @@ -422,7 +420,6 @@ void MacroJITx64Impl::Compile() { mov(PARAMETERS, Common::X64::ABI_PARAM2); xor_(RESULT, RESULT); xor_(METHOD_ADDRESS, METHOD_ADDRESS); - xor_(NEXT_PARAMETER, NEXT_PARAMETER); xor_(BRANCH_HOLDER, BRANCH_HOLDER); mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); @@ -529,8 +526,8 @@ bool MacroJITx64Impl::Compile_NextInstruction() { } Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { - mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]); - inc(NEXT_PARAMETER); + mov(eax, dword[PARAMETERS]); + add(PARAMETERS, sizeof(u32)); return eax; } From 256cb2979b2976017c94a4c2a42cae155bf604cd Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 20:10:39 +0100 Subject: [PATCH 074/145] CMakeLists: xbyak comes before dynarmic --- externals/CMakeLists.txt | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index df7a5e0a94..9be5b27800 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -4,6 +4,13 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules") list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules") include(DownloadExternals) +# xbyak +if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) + add_library(xbyak INTERFACE) + target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak) + target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES) +endif() + # Catch add_library(catch-single-include INTERFACE) target_include_directories(catch-single-include INTERFACE catch/single_include) @@ -75,11 +82,3 @@ if (ENABLE_WEB_SERVICE) target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT) target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto) endif() - -if (NOT TARGET xbyak) - if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) - 
add_library(xbyak INTERFACE) - target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak) - target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES) - endif() -endif() From e2f5d165407ebb998cc24005bc30476ba29d5bcd Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Tue, 16 Jun 2020 01:30:01 -0400 Subject: [PATCH 075/145] gl_device: Reserve at least 4 image bindings for fragment stage Due to the limitation of GL_MAX_IMAGE_UNITS being low (8) on Intel's and Nvidia's proprietary drivers, we have to reserve an appropriate amount of image bindings for each of the stages. So far games have been observed to use 4 image bindings on the fragment stage (Kirby Star Allies) and 1 on the vertex stage (TWD series). No games thus far in my limited testing used more than 4 images concurrently and across all currently active programs. This fixes shader compilation errors on Kirby Star Allies on OpenGL (GLSL/GLASM) --- src/video_core/renderer_opengl/gl_device.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 890fc6c638..21a4f4def7 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -123,16 +123,24 @@ std::array BuildBaseBindin u32 num_images = GetInteger(GL_MAX_IMAGE_UNITS); u32 base_images = 0; - // Reserve more image bindings on fragment and vertex stages. + // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8. + // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the + // fragment stage, and at least 1 for the rest of the stages. + // So far games are observed to use 1 image binding on vertex and 4 on fragment stages. + + // Reserve at least 4 image bindings on the fragment stage. bindings[4].image = - Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]); - bindings[0].image = - Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]); + Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]); + + // This is guaranteed to be at least 1. + const u32 total_extracted_images = num_images / (NumStages - 1); // Reserve the other image bindings. - const u32 total_extracted_images = num_images / (NumStages - 2); - for (std::size_t i = 2; i < NumStages; ++i) { + for (std::size_t i = 0; i < NumStages; ++i) { const std::size_t stage = stage_swizzle[i]; + if (stage == 4) { + continue; + } bindings[stage].image = Extract(base_images, num_images, total_extracted_images, LimitImages[stage]); } From 2a3d4cad63f80151893ee49ee205c5f72aefd321 Mon Sep 17 00:00:00 2001 From: bunnei Date: Tue, 16 Jun 2020 21:46:19 -0400 Subject: [PATCH 076/145] externals: Revert to libressl, as build is broken with find_package(OpenSSL). (#4093) * externals: Revert to libressl, as build is broken with find_package(OpenSLL). * fixup! externals: Revert to libressl, as build is broken with find_package(OpenSLL). * fixup! externals: Revert to libressl, as build is broken with find_package(OpenSLL). 
--- .gitmodules | 3 +++ CMakeLists.txt | 10 ---------- externals/CMakeLists.txt | 11 ++++++++++- externals/libressl | 1 + 4 files changed, 14 insertions(+), 11 deletions(-) create mode 160000 externals/libressl diff --git a/.gitmodules b/.gitmodules index 2ec9dda62a..9ba8fe2072 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,6 +13,9 @@ [submodule "soundtouch"] path = externals/soundtouch url = https://github.com/citra-emu/ext-soundtouch.git +[submodule "libressl"] + path = externals/libressl + url = https://github.com/citra-emu/ext-libressl-portable.git [submodule "discord-rpc"] path = externals/discord-rpc url = https://github.com/discordapp/discord-rpc.git diff --git a/CMakeLists.txt b/CMakeLists.txt index a9f669d568..b710712711 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,7 +152,6 @@ macro(yuzu_find_packages) "Boost 1.71 boost/1.72.0" "Catch2 2.11 catch2/2.11.0" "fmt 6.2 fmt/6.2.0" - "OpenSSL 1.1 openssl/1.1.1f" # can't use until https://github.com/bincrafters/community/issues/1173 #"libzip 1.5 libzip/1.5.2@bincrafters/stable" "lz4 1.8 lz4/1.9.2" @@ -312,15 +311,6 @@ elseif (TARGET Boost::boost) add_library(boost ALIAS Boost::boost) endif() -if (NOT TARGET OpenSSL::SSL) - set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE) - add_library(OpenSSL::SSL ALIAS OpenSSL::OpenSSL) -endif() -if (NOT TARGET OpenSSL::Crypto) - set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE) - add_library(OpenSSL::Crypto ALIAS OpenSSL::OpenSSL) -endif() - if (TARGET sdl2::sdl2) # imported from the conan generated sdl2Config.cmake set_target_properties(sdl2::sdl2 PROPERTIES IMPORTED_GLOBAL TRUE) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 9be5b27800..b80b276057 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -73,6 +73,15 @@ if (NOT LIBZIP_FOUND) endif() if (ENABLE_WEB_SERVICE) + # LibreSSL + set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "") + add_subdirectory(libressl EXCLUDE_FROM_ALL) + target_include_directories(ssl INTERFACE ./libressl/include) + target_compile_definitions(ssl PRIVATE -DHAVE_INET_NTOP) + get_directory_property(OPENSSL_LIBRARIES + DIRECTORY libressl + DEFINITION OPENSSL_LIBS) + # lurlparser add_subdirectory(lurlparser EXCLUDE_FROM_ALL) @@ -80,5 +89,5 @@ if (ENABLE_WEB_SERVICE) add_library(httplib INTERFACE) target_include_directories(httplib INTERFACE ./httplib) target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT) - target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto) + target_link_libraries(httplib INTERFACE ${OPENSSL_LIBRARIES}) endif() diff --git a/externals/libressl b/externals/libressl new file mode 160000 index 0000000000..7d01cb01cb --- /dev/null +++ b/externals/libressl @@ -0,0 +1 @@ +Subproject commit 7d01cb01cb1a926ecb4c9c98b107ef3c26f59dfb From a6ddd7c382e0362d5e86c1622c85c78c59c5aa3b Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 22:01:00 +0100 Subject: [PATCH 077/145] macro_jit_x64: Should not skip zero registers for certain ALU ops The code generated for these ALU ops assume src_a and src_b are always valid. 
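A minimal sketch of why the shortcut is invalid for these two operations (standalone C++; the function below is a plain restatement of the ALU semantics, not engine code): even when one source is the always-zero register, the previous carry still feeds both the sum and the new carry-out, so both operands have to be materialized before the adc/sbb sequence runs.

    #include <cstdint>

    // Roughly what the macro ALU's AddWithCarry computes.
    std::uint32_t AddWithCarry(std::uint32_t a, std::uint32_t b, std::uint32_t& carry_flag) {
        const std::uint64_t wide =
            static_cast<std::uint64_t>(a) + static_cast<std::uint64_t>(b) + carry_flag;
        carry_flag = static_cast<std::uint32_t>(wide >> 32); // new carry-out, even if a or b is 0
        return static_cast<std::uint32_t>(wide);
    }
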
--- src/video_core/macro/macro_jit_x64.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 2eb98173df..08279b9bcf 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -56,11 +56,13 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { const bool valid_operation = !is_a_zero && !is_b_zero; const bool is_move_operation = !is_a_zero && is_b_zero; const bool has_zero_register = is_a_zero || is_b_zero; + const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || + opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow; Xbyak::Reg32 src_a; Xbyak::Reg32 src_b; - if (!optimizer.zero_reg_skip) { + if (!optimizer.zero_reg_skip || no_zero_reg_skip) { src_a = Compile_GetRegister(opcode.src_a, RESULT); src_b = Compile_GetRegister(opcode.src_b, eax); } else { From c409722435bdb1f2eae4d192c89278e3b07fd2ed Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 22:01:25 +0100 Subject: [PATCH 078/145] macro_jit_x64: Optimization implicitly assumes same destination --- src/video_core/macro/macro_jit_x64.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 08279b9bcf..30a7e1fe97 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -185,7 +185,8 @@ void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { if (next_opcode.has_value()) { const auto next = *next_opcode; - if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod && + opcode.dst == next.dst) { return; } } From 32a127faaaba6f046e4b4152a2bf300a5c18dedf Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 15:40:15 +0100 Subject: [PATCH 079/145] arm_dynarmic_32: InterpreterFallback should never happen --- src/core/arm/dynarmic/arm_dynarmic_32.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp index 9bc86e3b9a..4c8ab46747 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp @@ -50,7 +50,8 @@ public: } void InterpreterFallback(u32 pc, std::size_t num_instructions) override { - UNIMPLEMENTED(); + UNIMPLEMENTED_MSG("This should never happen, pc = {:08X}, code = {:08X}", pc, + MemoryReadCode(pc)); } void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override { @@ -112,7 +113,7 @@ void ARM_Dynarmic_32::Run() { } void ARM_Dynarmic_32::Step() { - cb->InterpreterFallback(jit->Regs()[15], 1); + jit->Step(); } ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, From 109df7705f01b7f513ac08f0076266b2c529c373 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 16:32:08 +0100 Subject: [PATCH 080/145] arm_dynarmic_cp15: Update CP15 --- src/core/arm/dynarmic/arm_dynarmic_32.cpp | 16 ++- src/core/arm/dynarmic/arm_dynarmic_32.h | 5 +- src/core/arm/dynarmic/arm_dynarmic_cp15.cpp | 68 ++++++++--- src/core/arm/dynarmic/arm_dynarmic_cp15.h | 126 ++------------------ 4 files changed, 73 insertions(+), 142 deletions(-) diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp index 
4c8ab46747..e7456a8c34 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp @@ -90,8 +90,6 @@ public: ARM_Dynarmic_32& parent; std::size_t num_interpreted_instructions{}; - u64 tpidrro_el0{}; - u64 tpidr_el0{}; }; std::shared_ptr ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table, @@ -100,7 +98,7 @@ std::shared_ptr ARM_Dynarmic_32::MakeJit(Common::PageTable& config.callbacks = cb.get(); // TODO(bunnei): Implement page table for 32-bit // config.page_table = &page_table.pointers; - config.coprocessors[15] = std::make_shared((u32*)&CP15_regs[0]); + config.coprocessors[15] = cp15; config.define_unpredictable_behaviour = true; return std::make_unique(config); } @@ -118,8 +116,8 @@ void ARM_Dynarmic_32::Step() { ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index) - : ARM_Interface{system}, - cb(std::make_unique(*this)), core_index{core_index}, + : ARM_Interface{system}, cb(std::make_unique(*this)), + cp15(std::make_shared(*this)), core_index{core_index}, exclusive_monitor{dynamic_cast(exclusive_monitor)} {} ARM_Dynarmic_32::~ARM_Dynarmic_32() = default; @@ -155,19 +153,19 @@ void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) { } u64 ARM_Dynarmic_32::GetTlsAddress() const { - return CP15_regs[static_cast(CP15Register::CP15_THREAD_URO)]; + return cp15->uro; } void ARM_Dynarmic_32::SetTlsAddress(VAddr address) { - CP15_regs[static_cast(CP15Register::CP15_THREAD_URO)] = static_cast(address); + cp15->uro = static_cast(address); } u64 ARM_Dynarmic_32::GetTPIDR_EL0() const { - return cb->tpidr_el0; + return cp15->uprw; } void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) { - cb->tpidr_el0 = value; + cp15->uprw = value; } void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) { diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h index 8ba9cea8f1..e5b92d7bb0 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_32.h +++ b/src/core/arm/dynarmic/arm_dynarmic_32.h @@ -22,6 +22,7 @@ class Memory; namespace Core { class DynarmicCallbacks32; +class DynarmicCP15; class DynarmicExclusiveMonitor; class System; @@ -66,12 +67,14 @@ private: std::unordered_map, Common::PairHash>; friend class DynarmicCallbacks32; + friend class DynarmicCP15; + std::unique_ptr cb; JitCacheType jit_cache; std::shared_ptr jit; + std::shared_ptr cp15; std::size_t core_index; DynarmicExclusiveMonitor& exclusive_monitor; - std::array CP15_regs{}; }; } // namespace Core diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp index 3fdcdebdee..81b2c400f8 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp @@ -2,79 +2,119 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include +#include "common/logging/log.h" +#include "core/arm/dynarmic/arm_dynarmic_32.h" #include "core/arm/dynarmic/arm_dynarmic_cp15.h" using Callback = Dynarmic::A32::Coprocessor::Callback; using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord; using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords; +template <> +struct fmt::formatter { + constexpr auto parse(format_parse_context& ctx) { + return ctx.begin(); + } + template + auto format(const Dynarmic::A32::CoprocReg& reg, FormatContext& ctx) { + return format_to(ctx.out(), "cp{}", static_cast(reg)); + } +}; + +namespace Core { + +static u32 dummy_value; + std::optional DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, unsigned opc2) { + LOG_CRITICAL(Core_ARM, "CP15: cdp{} p15, {}, {}, {}, {}, {}", two ? "2" : "", opc1, CRd, CRn, + CRm, opc2); return {}; } CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm, unsigned opc2) { - // TODO(merry): Privileged CP15 registers - if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C5 && opc2 == 4) { + // CP15_FLUSH_PREFETCH_BUFFER // This is a dummy write, we ignore the value written here. - return &CP15[static_cast(CP15Register::CP15_FLUSH_PREFETCH_BUFFER)]; + return &dummy_value; } if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C10) { switch (opc2) { case 4: + // CP15_DATA_SYNC_BARRIER // This is a dummy write, we ignore the value written here. - return &CP15[static_cast(CP15Register::CP15_DATA_SYNC_BARRIER)]; + return &dummy_value; case 5: + // CP15_DATA_MEMORY_BARRIER // This is a dummy write, we ignore the value written here. - return &CP15[static_cast(CP15Register::CP15_DATA_MEMORY_BARRIER)]; - default: - return {}; + return &dummy_value; } } if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0 && opc2 == 2) { - return &CP15[static_cast(CP15Register::CP15_THREAD_UPRW)]; + // CP15_THREAD_UPRW + return &uprw; } + LOG_CRITICAL(Core_ARM, "CP15: mcr{} p15, {}, , {}, {}, {}", two ? "2" : "", opc1, CRn, CRm, + opc2); return {}; } CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) { + LOG_CRITICAL(Core_ARM, "CP15: mcrr{} p15, {}, , , {}", two ? "2" : "", opc, CRm); return {}; } CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm, unsigned opc2) { - // TODO(merry): Privileged CP15 registers - if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0) { switch (opc2) { case 2: - return &CP15[static_cast(CP15Register::CP15_THREAD_UPRW)]; + // CP15_THREAD_UPRW + return &uprw; case 3: - return &CP15[static_cast(CP15Register::CP15_THREAD_URO)]; - default: - return {}; + // CP15_THREAD_URO + return &uro; } } + LOG_CRITICAL(Core_ARM, "CP15: mrc{} p15, {}, , {}, {}, {}", two ? "2" : "", opc1, CRn, CRm, + opc2); return {}; } CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) { + LOG_CRITICAL(Core_ARM, "CP15: mrrc{} p15, {}, , , {}", two ? "2" : "", opc, CRm); return {}; } std::optional DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd, std::optional option) { + if (option) { + LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...], {}", two ? "2" : "", + long_transfer ? "l" : "", CRd, *option); + } else { + LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...]", two ? 
"2" : "", + long_transfer ? "l" : "", CRd); + } return {}; } std::optional DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd, std::optional option) { + if (option) { + LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...], {}", two ? "2" : "", + long_transfer ? "l" : "", CRd, *option); + } else { + LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...]", two ? "2" : "", + long_transfer ? "l" : "", CRd); + } return {}; } + +} // namespace Core diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.h b/src/core/arm/dynarmic/arm_dynarmic_cp15.h index 07bcde5f9d..7356d252e6 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_cp15.h +++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.h @@ -10,128 +10,15 @@ #include #include "common/common_types.h" -enum class CP15Register { - // c0 - Information registers - CP15_MAIN_ID, - CP15_CACHE_TYPE, - CP15_TCM_STATUS, - CP15_TLB_TYPE, - CP15_CPU_ID, - CP15_PROCESSOR_FEATURE_0, - CP15_PROCESSOR_FEATURE_1, - CP15_DEBUG_FEATURE_0, - CP15_AUXILIARY_FEATURE_0, - CP15_MEMORY_MODEL_FEATURE_0, - CP15_MEMORY_MODEL_FEATURE_1, - CP15_MEMORY_MODEL_FEATURE_2, - CP15_MEMORY_MODEL_FEATURE_3, - CP15_ISA_FEATURE_0, - CP15_ISA_FEATURE_1, - CP15_ISA_FEATURE_2, - CP15_ISA_FEATURE_3, - CP15_ISA_FEATURE_4, +namespace Core { - // c1 - Control registers - CP15_CONTROL, - CP15_AUXILIARY_CONTROL, - CP15_COPROCESSOR_ACCESS_CONTROL, - - // c2 - Translation table registers - CP15_TRANSLATION_BASE_TABLE_0, - CP15_TRANSLATION_BASE_TABLE_1, - CP15_TRANSLATION_BASE_CONTROL, - CP15_DOMAIN_ACCESS_CONTROL, - CP15_RESERVED, - - // c5 - Fault status registers - CP15_FAULT_STATUS, - CP15_INSTR_FAULT_STATUS, - CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS, - CP15_INST_FSR, - - // c6 - Fault Address registers - CP15_FAULT_ADDRESS, - CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS, - CP15_WFAR, - CP15_IFAR, - - // c7 - Cache operation registers - CP15_WAIT_FOR_INTERRUPT, - CP15_PHYS_ADDRESS, - CP15_INVALIDATE_INSTR_CACHE, - CP15_INVALIDATE_INSTR_CACHE_USING_MVA, - CP15_INVALIDATE_INSTR_CACHE_USING_INDEX, - CP15_FLUSH_PREFETCH_BUFFER, - CP15_FLUSH_BRANCH_TARGET_CACHE, - CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY, - CP15_INVALIDATE_DATA_CACHE, - CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA, - CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX, - CP15_INVALIDATE_DATA_AND_INSTR_CACHE, - CP15_CLEAN_DATA_CACHE, - CP15_CLEAN_DATA_CACHE_LINE_USING_MVA, - CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX, - CP15_DATA_SYNC_BARRIER, - CP15_DATA_MEMORY_BARRIER, - CP15_CLEAN_AND_INVALIDATE_DATA_CACHE, - CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA, - CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX, - - // c8 - TLB operations - CP15_INVALIDATE_ITLB, - CP15_INVALIDATE_ITLB_SINGLE_ENTRY, - CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH, - CP15_INVALIDATE_ITLB_ENTRY_ON_MVA, - CP15_INVALIDATE_DTLB, - CP15_INVALIDATE_DTLB_SINGLE_ENTRY, - CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH, - CP15_INVALIDATE_DTLB_ENTRY_ON_MVA, - CP15_INVALIDATE_UTLB, - CP15_INVALIDATE_UTLB_SINGLE_ENTRY, - CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH, - CP15_INVALIDATE_UTLB_ENTRY_ON_MVA, - - // c9 - Data cache lockdown register - CP15_DATA_CACHE_LOCKDOWN, - - // c10 - TLB/Memory map registers - CP15_TLB_LOCKDOWN, - CP15_PRIMARY_REGION_REMAP, - CP15_NORMAL_REGION_REMAP, - - // c13 - Thread related registers - CP15_PID, - CP15_CONTEXT_ID, - CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write - CP15_THREAD_URO, // Thread ID register - User Read Only (Privileged R/W) - CP15_THREAD_PRW, // Thread ID register - Privileged R/W only. 
- - // c15 - Performance and TLB lockdown registers - CP15_PERFORMANCE_MONITOR_CONTROL, - CP15_CYCLE_COUNTER, - CP15_COUNT_0, - CP15_COUNT_1, - CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY, - CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY, - CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS, - CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS, - CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE, - CP15_TLB_DEBUG_CONTROL, - - // Skyeye defined - CP15_TLB_FAULT_ADDR, - CP15_TLB_FAULT_STATUS, - - // Not an actual register. - // All registers should be defined above this. - CP15_REGISTER_COUNT, -}; +class ARM_Dynarmic_32; class DynarmicCP15 final : public Dynarmic::A32::Coprocessor { public: using CoprocReg = Dynarmic::A32::CoprocReg; - explicit DynarmicCP15(u32* cp15) : CP15(cp15){}; + explicit DynarmicCP15(ARM_Dynarmic_32& parent) : parent(parent) {} std::optional CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, @@ -147,6 +34,9 @@ public: std::optional CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd, std::optional option) override; -private: - u32* CP15{}; + ARM_Dynarmic_32& parent; + u32 uprw; + u32 uro; }; + +} // namespace Core From 52bcfac1169b841a1bac6cd017fa9922dd10f4a0 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 16:32:21 +0100 Subject: [PATCH 081/145] arm_dynarmic_cp15: Implement CNTPCT --- src/core/arm/dynarmic/arm_dynarmic_cp15.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp index 81b2c400f8..d43e4dd706 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp @@ -6,6 +6,9 @@ #include "common/logging/log.h" #include "core/arm/dynarmic/arm_dynarmic_32.h" #include "core/arm/dynarmic/arm_dynarmic_cp15.h" +#include "core/core.h" +#include "core/core_timing.h" +#include "core/core_timing_util.h" using Callback = Dynarmic::A32::Coprocessor::Callback; using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord; @@ -89,6 +92,16 @@ CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, } CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) { + if (!two && opc == 0 && CRm == CoprocReg::C14) { + // CNTPCT + const auto callback = static_cast( + [](Dynarmic::A32::Jit*, void* arg, u32, u32) -> u64 { + ARM_Dynarmic_32& parent = *(ARM_Dynarmic_32*)arg; + return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks()); + }); + return Dynarmic::A32::Coprocessor::Callback{callback, (void*)&parent}; + } + LOG_CRITICAL(Core_ARM, "CP15: mrrc{} p15, {}, , , {}", two ? 
"2" : "", opc, CRm); return {}; } From 44f10d9b9f4ac6fb718718a85a5916721e7944e4 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 15 Jun 2020 20:03:32 +0100 Subject: [PATCH 082/145] macro_jit_x64: Inline Engines::Maxwell3D::GetRegisterValue --- src/video_core/macro/macro_jit_x64.cpp | 23 +++++++++++++++++------ src/video_core/macro/macro_jit_x64.h | 1 + 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index d4a97ec7b5..0b2918388b 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -295,12 +295,20 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { sub(result, opcode.immediate * -1); } } - Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - mov(Common::X64::ABI_PARAM1, qword[STATE]); - mov(Common::X64::ABI_PARAM2, RESULT); - Common::X64::CallFarFunction(*this, &Read); - Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); - mov(RESULT, Common::X64::ABI_RETURN.cvt32()); + + // Equivalent to Engines::Maxwell3D::GetRegisterValue: + if (optimizer.enable_asserts) { + Xbyak::Label pass_range_check; + cmp(RESULT, static_cast(Engines::Maxwell3D::Regs::NUM_REGS)); + jb(pass_range_check); + int3(); + L(pass_range_check); + } + mov(rax, qword[STATE]); + mov(RESULT, + dword[rax + offsetof(Engines::Maxwell3D, regs) + + offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]); + Compile_ProcessResult(opcode.result_operation, opcode.dst); } @@ -435,6 +443,9 @@ void MacroJITx64Impl::Compile() { // one if our register isn't "dirty" optimizer.optimize_for_method_move = true; + // Enable run-time assertions in JITted code + optimizer.enable_asserts = false; + // Check to see if we can skip emitting certain instructions Optimizer_ScanFlags(); diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 51ec090b85..a180e74283 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -76,6 +76,7 @@ private: bool zero_reg_skip{}; bool skip_dummy_addimmediate{}; bool optimize_for_method_move{}; + bool enable_asserts{}; }; OptimizerState optimizer{}; From 8868fb745f5adaab83a26d74ebce35ed371adffe Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sat, 13 Jun 2020 08:17:37 -0400 Subject: [PATCH 083/145] maxwell_to_gl: Miscellaneous changes maxwell_to_gl: Log unimplemented features under UNIMPLEMENTED_MSG instead of LOG_ERROR to bring into parity with maxwell_to_vk maxwell_to_gl: Deduplicate logging in VertexType(), merging them into one. maxwell_to_gl: Return GL_NEAREST instead of GL_LINEAR if an unknown texture filter mode is encountered. maxwell_to_gl: Log the mipmap filter mode if an unknown value is passed in. maxwell_to_gl: Reorder filtering modes to start with None, then Nearest, then Linear. 
--- .../renderer_opengl/maxwell_to_gl.h | 86 ++++++++----------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 994ae98eb5..35e3292403 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -46,10 +46,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_UNSIGNED_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } + break; case Maxwell::VertexAttribute::Type::SignedInt: case Maxwell::VertexAttribute::Type::SignedNorm: switch (attrib.size) { @@ -70,10 +68,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } + break; case Maxwell::VertexAttribute::Type::Float: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_16: @@ -86,10 +82,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32: case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } + break; case Maxwell::VertexAttribute::Type::UnsignedScaled: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: @@ -102,10 +96,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_16_16_16: case Maxwell::VertexAttribute::Size::Size_16_16_16_16: return GL_UNSIGNED_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } + break; case Maxwell::VertexAttribute::Type::SignedScaled: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: @@ -118,14 +110,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_16_16_16: case Maxwell::VertexAttribute::Size::Size_16_16_16_16: return GL_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - return {}; + break; } + UNIMPLEMENTED_MSG("Unimplemented vertex type={} and size={}", attrib.TypeString(), + attrib.SizeString()); + return {}; } inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { @@ -137,8 +127,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { case Maxwell::IndexFormat::UnsignedInt: return GL_UNSIGNED_INT; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast(index_format)); - UNREACHABLE(); + UNREACHABLE_MSG("Invalid index_format={}", static_cast(index_format)); return {}; } @@ -180,10 +169,20 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { } inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, - Tegra::Texture::TextureMipmapFilter mip_filter_mode) { + Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) { switch (filter_mode) { - case Tegra::Texture::TextureFilter::Linear: { - switch (mip_filter_mode) { + case Tegra::Texture::TextureFilter::Nearest: + switch (mipmap_filter_mode) { + case 
Tegra::Texture::TextureMipmapFilter::None: + return GL_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Nearest: + return GL_NEAREST_MIPMAP_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Linear: + return GL_NEAREST_MIPMAP_LINEAR; + } + break; + case Tegra::Texture::TextureFilter::Linear: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: return GL_LINEAR; case Tegra::Texture::TextureMipmapFilter::Nearest: @@ -193,20 +192,9 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, } break; } - case Tegra::Texture::TextureFilter::Nearest: { - switch (mip_filter_mode) { - case Tegra::Texture::TextureMipmapFilter::None: - return GL_NEAREST; - case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_NEAREST_MIPMAP_NEAREST; - case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_NEAREST_MIPMAP_LINEAR; - } - break; - } - } - LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast(filter_mode)); - return GL_LINEAR; + UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", + static_cast(filter_mode), static_cast(mipmap_filter_mode)); + return GL_NEAREST; } inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { @@ -229,10 +217,9 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { } else { return GL_MIRROR_CLAMP_TO_EDGE; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast(wrap_mode)); - return GL_REPEAT; } + UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast(wrap_mode)); + return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { @@ -254,8 +241,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { case Tegra::Texture::DepthCompareFunc::Always: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}", - static_cast(func)); + UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast(func)); return GL_GREATER; } @@ -277,7 +263,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return GL_MAX; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast(equation)); return GL_FUNC_ADD; } @@ -341,7 +327,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return GL_ONE_MINUS_CONSTANT_ALPHA; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast(factor)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast(factor)); return GL_ZERO; } @@ -361,7 +347,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { case Tegra::Texture::SwizzleSource::OneFloat: return GL_ONE; } - LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast(source)); + UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast(source)); return GL_ZERO; } @@ -392,7 +378,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast(comparison)); + UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast(comparison)); return GL_ALWAYS; } @@ -423,7 +409,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { case Maxwell::StencilOp::DecrWrapOGL: return GL_DECR_WRAP; } - LOG_ERROR(Render_OpenGL, "Unimplemented stencil 
op={}", static_cast(stencil)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast(stencil)); return GL_KEEP; } @@ -434,7 +420,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } - LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast(front_face)); return GL_CCW; } @@ -447,7 +433,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } - LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast(cull_face)); return GL_BACK; } @@ -486,7 +472,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { case Maxwell::LogicOperation::Set: return GL_SET; } - LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast(operation)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast(operation)); return GL_COPY; } From be660e77490b65d004a77d47bca1c443eb15482c Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Thu, 18 Jun 2020 04:42:40 -0400 Subject: [PATCH 084/145] maxwell_to_vk: Reorder filter cases and correct mipmap_filter=None maxwell_to_vk: Reorder filtering modes to start with None, then Nearest, then Linear. maxwell_to_vk: Logs filter modes under UNREACHABLE_MSG instead of UNIMPLEMENTED_MSG, since any unknown filter modes are invalid and not unimplemented. maxwell_to_vk: Return VK_SAMPLER_MIPMAP_MODE_NEAREST instead of VK_SAMPLER_MIPMAP_MODE_LINEAR when mipmap_filter is None with the description from the VkSamplerCreateInfo(3) man page. --- .../renderer_vulkan/maxwell_to_vk.cpp | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 62e950d310..1f2b6734ba 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -21,29 +21,29 @@ namespace Sampler { VkFilter Filter(Tegra::Texture::TextureFilter filter) { switch (filter) { - case Tegra::Texture::TextureFilter::Linear: - return VK_FILTER_LINEAR; case Tegra::Texture::TextureFilter::Nearest: return VK_FILTER_NEAREST; + case Tegra::Texture::TextureFilter::Linear: + return VK_FILTER_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast(filter)); + UNREACHABLE_MSG("Invalid sampler filter={}", static_cast(filter)); return {}; } VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) { switch (mipmap_filter) { case Tegra::Texture::TextureMipmapFilter::None: - // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping - // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to - // use an image view with a single mipmap level to emulate this. - return VK_SAMPLER_MIPMAP_MODE_LINEAR; - ; - case Tegra::Texture::TextureMipmapFilter::Linear: - return VK_SAMPLER_MIPMAP_MODE_LINEAR; + // There are no Vulkan filter modes that directly correspond to OpenGL minification filters + // of GL_LINEAR or GL_NEAREST, but they can be emulated using + // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, and maxLod = 0.25, and using minFilter = + // VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST, respectively. 
+ return VK_SAMPLER_MIPMAP_MODE_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: return VK_SAMPLER_MIPMAP_MODE_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Linear: + return VK_SAMPLER_MIPMAP_MODE_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast(mipmap_filter)); + UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast(mipmap_filter)); return {}; } @@ -78,10 +78,9 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w case Tegra::Texture::WrapMode::MirrorOnceBorder: UNIMPLEMENTED(); return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; - default: - UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast(wrap_mode)); - return {}; } + UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast(wrap_mode)); + return {}; } VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) { @@ -288,10 +287,9 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; case Maxwell::PrimitiveTopology::Patches: return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; - default: - UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast(topology)); - return {}; } + UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast(topology)); + return {}; } VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { From 2f420618ea88c897b8607a2e1bfd0b34fa7c2017 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Thu, 18 Jun 2020 04:43:06 -0400 Subject: [PATCH 085/145] vk_sampler_cache: Emulate GL_LINEAR/NEAREST minification filters Emulate GL_LINEAR/NEAREST minification filters using minLod = 0 and maxLod = 0.25 during sampler creation --- src/video_core/renderer_vulkan/vk_sampler_cache.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index e6f2fa553e..616eacc368 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -9,6 +9,8 @@ #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/textures/texture.h" +using Tegra::Texture::TextureMipmapFilter; + namespace Vulkan { namespace { @@ -63,8 +65,8 @@ vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) c ci.maxAnisotropy = tsc.GetMaxAnisotropy(); ci.compareEnable = tsc.depth_compare_enabled; ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func); - ci.minLod = tsc.GetMinLod(); - ci.maxLod = tsc.GetMaxLod(); + ci.minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod(); + ci.maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod(); ci.borderColor = arbitrary_borders ? 
VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color); ci.unnormalizedCoordinates = VK_FALSE; return device.GetLogical().CreateSampler(ci); From 8ae715454157f4fdc615ee45b914dfa274c3e702 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 12:22:09 +0100 Subject: [PATCH 086/145] Rename PAGE_SHIFT to PAGE_BITS macOS header files #define PAGE_SHIFT --- src/video_core/query_cache.h | 10 +++++----- src/video_core/shader_cache.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 2f75f88015..e12dab8999 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -220,8 +220,8 @@ private: return cache_begin < addr_end && addr_begin < cache_end; }; - const u64 page_end = addr_end >> PAGE_SHIFT; - for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) { const auto& it = cached_queries.find(page); if (it == std::end(cached_queries)) { continue; @@ -242,14 +242,14 @@ private: /// Registers the passed parameters as cached and returns a pointer to the stored cached query. CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); - const u64 page = static_cast(cpu_addr) >> PAGE_SHIFT; + const u64 page = static_cast(cpu_addr) >> PAGE_BITS; return &cached_queries[page].emplace_back(static_cast(*this), type, cpu_addr, host_ptr); } /// Tries to a get a cached query. Returns nullptr on failure. CachedQuery* TryGet(VAddr addr) { - const u64 page = static_cast(addr) >> PAGE_SHIFT; + const u64 page = static_cast(addr) >> PAGE_BITS; const auto it = cached_queries.find(page); if (it == std::end(cached_queries)) { return nullptr; @@ -268,7 +268,7 @@ private: } static constexpr std::uintptr_t PAGE_SIZE = 4096; - static constexpr unsigned PAGE_SHIFT = 12; + static constexpr unsigned PAGE_BITS = 12; Core::System& system; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h index a23c238868..2dd270e99e 100644 --- a/src/video_core/shader_cache.h +++ b/src/video_core/shader_cache.h @@ -19,7 +19,7 @@ namespace VideoCommon { template class ShaderCache { - static constexpr u64 PAGE_SHIFT = 14; + static constexpr u64 PAGE_BITS = 14; struct Entry { VAddr addr_start; @@ -87,8 +87,8 @@ protected: const VAddr addr_end = addr + size; Entry* const entry = NewEntry(addr, addr_end, data.get()); - const u64 page_end = addr_end >> PAGE_SHIFT; - for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { invalidation_cache[page].push_back(entry); } @@ -108,8 +108,8 @@ private: /// @pre invalidation_mutex is locked void InvalidatePagesInRegion(VAddr addr, std::size_t size) { const VAddr addr_end = addr + size; - const u64 page_end = addr_end >> PAGE_SHIFT; - for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { const auto it = invalidation_cache.find(page); if (it == invalidation_cache.end()) { continue; From 442e48ef4c28fb22115bd30e9e988b91ae02ee84 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 12:22:50 +0100 Subject: [PATCH 087/145] memory_util: 
boost hashes are size_t * boost::hash_value returns a size_t * boost::hash_combine takes a size_t& argument --- src/video_core/shader/memory_util.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp index 074f21691c..5071c83ca8 100644 --- a/src/video_core/shader/memory_util.cpp +++ b/src/video_core/shader/memory_util.cpp @@ -66,12 +66,12 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_add u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code, const ProgramCode& code_b) { - u64 unique_identifier = boost::hash_value(code); + size_t unique_identifier = boost::hash_value(code); if (is_a) { // VertexA programs include two programs boost::hash_combine(unique_identifier, boost::hash_value(code_b)); } - return unique_identifier; + return static_cast(unique_identifier); } } // namespace VideoCommon::Shader From b1eada607960802e9f5cd33f3f9ec8d1aa5dd85e Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 12:25:06 +0100 Subject: [PATCH 088/145] renderer_vulkan: Fix macOS GetBundleDirectory reference --- src/video_core/renderer_vulkan/renderer_vulkan.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 59b4419438..cd9673d1ff 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -13,6 +13,7 @@ #include #include "common/dynamic_library.h" +#include "common/file_util.h" #include "common/logging/log.h" #include "common/telemetry.h" #include "core/core.h" @@ -76,7 +77,8 @@ Common::DynamicLibrary OpenVulkanLibrary() { char* libvulkan_env = getenv("LIBVULKAN_PATH"); if (!libvulkan_env || !library.Open(libvulkan_env)) { // Use the libvulkan.dylib from the application bundle. 
- std::string filename = File::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; + const std::string filename = + FileUtil::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; library.Open(filename.c_str()); } #else From 69f38355ed04b337a49c55690e2e05e98dc86de6 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 12:26:17 +0100 Subject: [PATCH 089/145] vk_rasterizer: BindTransformFeedbackBuffersEXT accepts a size of type VkDeviceSize --- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 184b2238a7..a77fa35c38 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -870,7 +870,7 @@ void RasterizerVulkan::BeginTransformFeedback() { UNIMPLEMENTED_IF(binding.buffer_offset != 0); const GPUVAddr gpu_addr = binding.Address(); - const std::size_t size = binding.buffer_size; + const auto size = static_cast(binding.buffer_size); const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) { From 4f09f0aea4f31d381f727e9d6a79ee04ab8ac44f Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 12:27:41 +0100 Subject: [PATCH 090/145] shared_font: Service::NS::EncryptSharedFont takes a size_t& --- src/core/file_sys/system_archive/shared_font.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/file_sys/system_archive/shared_font.cpp b/src/core/file_sys/system_archive/shared_font.cpp index 2c05eb42e2..c5cdf7d9bc 100644 --- a/src/core/file_sys/system_archive/shared_font.cpp +++ b/src/core/file_sys/system_archive/shared_font.cpp @@ -23,7 +23,7 @@ VirtualFile PackBFTTF(const std::array& data, const std::string& name) std::vector bfttf(Size + sizeof(u64)); - u64 offset = 0; + size_t offset = 0; Service::NS::EncryptSharedFont(vec, bfttf, offset); return std::make_shared(std::move(bfttf), name); } From b19fe55f8420ab6f6b8ef35ef7d80be54a02ce17 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 12:30:06 +0100 Subject: [PATCH 091/145] memory_manager: Explicitly specifcy std::min --- src/core/hle/kernel/memory/memory_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/hle/kernel/memory/memory_manager.cpp b/src/core/hle/kernel/memory/memory_manager.cpp index 6b432e1b2c..6161481905 100644 --- a/src/core/hle/kernel/memory/memory_manager.cpp +++ b/src/core/hle/kernel/memory/memory_manager.cpp @@ -104,7 +104,7 @@ ResultCode MemoryManager::Allocate(PageLinkedList& page_list, std::size_t num_pa // Ensure that we don't leave anything un-freed auto group_guard = detail::ScopeExit([&] { for (const auto& it : page_list.Nodes()) { - const auto min_num_pages{std::min( + const auto min_num_pages{std::min( it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)}; chosen_manager.Free(it.GetAddress(), min_num_pages); } @@ -165,7 +165,7 @@ ResultCode MemoryManager::Free(PageLinkedList& page_list, std::size_t num_pages, // Free all of the pages for (const auto& it : page_list.Nodes()) { - const auto min_num_pages{std::min( + const auto min_num_pages{std::min( it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)}; chosen_manager.Free(it.GetAddress(), min_num_pages); } From 778f86989a446e73133d893c6588c728e6fb2206 Mon Sep 17 
00:00:00 2001 From: MerryMage Date: Wed, 17 Jun 2020 12:37:15 +0100 Subject: [PATCH 092/145] bootmanager: Remove references to OpenGL for macOS OpenGL macOS headers definitions clash heavily with each other --- src/yuzu/CMakeLists.txt | 4 ++++ src/yuzu/bootmanager.cpp | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index 8b94047189..75c27e39e5 100644 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -208,6 +208,10 @@ if (MSVC) copy_yuzu_unicorn_deps(yuzu) endif() +if (NOT APPLE) + target_compile_definitions(yuzu PRIVATE HAS_OPENGL) +endif() + if (ENABLE_VULKAN) target_include_directories(yuzu PRIVATE ../../externals/Vulkan-Headers/include) target_compile_definitions(yuzu PRIVATE HAS_VULKAN) diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp index 1f5e43043c..696da2137c 100644 --- a/src/yuzu/bootmanager.cpp +++ b/src/yuzu/bootmanager.cpp @@ -8,13 +8,16 @@ #include #include #include -#include -#include #include #include #include #include +#ifdef HAS_OPENGL +#include +#include +#endif + #if !defined(WIN32) && HAS_VULKAN #include #endif @@ -98,6 +101,7 @@ void EmuThread::run() { #endif } +#ifdef HAS_OPENGL class OpenGLSharedContext : public Core::Frontend::GraphicsContext { public: /// Create the original context that should be shared from @@ -183,6 +187,7 @@ private: std::unique_ptr offscreen_surface{}; QSurface* surface; }; +#endif class DummyContext : public Core::Frontend::GraphicsContext {}; @@ -473,6 +478,7 @@ void GRenderWindow::resizeEvent(QResizeEvent* event) { } std::unique_ptr GRenderWindow::CreateSharedContext() const { +#ifdef HAS_OPENGL if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) { auto c = static_cast(main_context.get()); // Bind the shared contexts to the main surface in case the backend wants to take over @@ -480,6 +486,7 @@ std::unique_ptr GRenderWindow::CreateSharedCont return std::make_unique(c->GetShareContext(), child_widget->windowHandle()); } +#endif return std::make_unique(); } @@ -560,6 +567,7 @@ void GRenderWindow::OnMinimalClientAreaChangeRequest(std::pair minimal } bool GRenderWindow::InitializeOpenGL() { +#ifdef HAS_OPENGL // TODO: One of these flags might be interesting: WA_OpaquePaintEvent, WA_NoBackground, // WA_DontShowOnScreen, WA_DeleteOnClose auto child = new OpenGLRenderWidget(this); @@ -571,6 +579,11 @@ bool GRenderWindow::InitializeOpenGL() { std::make_unique(context->GetShareContext(), child->windowHandle())); return true; +#else + QMessageBox::warning(this, tr("OpenGL not available!"), + tr("yuzu has not been compiled with OpenGL support.")); + return false; +#endif } bool GRenderWindow::InitializeVulkan() { From 778043a44c005fb979bca0ff5de85e340390609d Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 18 Jun 2020 16:52:15 -0300 Subject: [PATCH 093/145] arm_dynarmic_32: Fix implicit conversion error in SetTPIDR_EL0 On MSVC builds we treat conversion warnings as errors. 
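For reference, a tiny self-contained illustration of the class of error being fixed (the
names here are illustrative, not the yuzu declarations): assigning a 64-bit value to a
32-bit field is an implicit narrowing conversion, which MSVC reports as C4244 and which
warnings-as-errors then promotes to a build failure; the explicit cast states that the
truncation is intentional.

    #include <cstdint>

    struct CP15Like {
        std::uint32_t uprw = 0; // 32-bit coprocessor register
    };

    void SetThreadId(CP15Like& cp15, std::uint64_t value) {
        // cp15.uprw = value;  // C4244 under MSVC, fatal with warnings-as-errors
        cp15.uprw = static_cast<std::uint32_t>(value); // intentional truncation
    }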
--- src/core/arm/dynarmic/arm_dynarmic_32.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp index e7456a8c34..19d798dc70 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp @@ -165,7 +165,7 @@ u64 ARM_Dynarmic_32::GetTPIDR_EL0() const { } void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) { - cp15->uprw = value; + cp15->uprw = static_cast(value); } void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) { From 7d763f060eb0fe151a629aa36cce3d7ce076e12a Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 18 Jun 2020 17:47:19 -0300 Subject: [PATCH 094/145] vk_update_descriptor: Upload descriptor sets data directly Instead of copying to a temporary payload before sending the update task to the worker thread, insert elements to the payload directly. --- .../renderer_vulkan/vk_rasterizer.cpp | 4 +-- .../renderer_vulkan/vk_update_descriptor.cpp | 36 +++++++------------ .../renderer_vulkan/vk_update_descriptor.h | 32 ++++++++--------- 3 files changed, 30 insertions(+), 42 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 184b2238a7..91da9ff800 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -1154,7 +1154,7 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu const auto sampler = sampler_cache.GetSampler(texture.tsc); update_descriptor_queue.AddSampledImage(sampler, image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; sampled_views.push_back(ImageView{std::move(view), image_layout}); } @@ -1180,7 +1180,7 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source); update_descriptor_queue.AddImage(image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_GENERAL; image_views.push_back(ImageView{std::move(view), image_layout}); } diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 681ecde984..351c048d23 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -24,35 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() { } void VKUpdateDescriptorQueue::Acquire() { - entries.clear(); -} + // Minimum number of entries required. + // This is the maximum number of entries a single draw call migth use. 
+ static constexpr std::size_t MIN_ENTRIES = 0x400; -void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, - VkDescriptorSet set) { - if (payload.size() + entries.size() >= payload.max_size()) { + if (payload.size() + MIN_ENTRIES >= payload.max_size()) { LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread"); scheduler.WaitWorker(); payload.clear(); } + upload_start = &*payload.end(); +} - // TODO(Rodrigo): Rework to write the payload directly - const auto payload_start = payload.data() + payload.size(); - for (const auto& entry : entries) { - if (const auto image = std::get_if(&entry)) { - payload.push_back(*image); - } else if (const auto buffer = std::get_if(&entry)) { - payload.push_back(*buffer); - } else if (const auto texel = std::get_if(&entry)) { - payload.push_back(*texel); - } else { - UNREACHABLE(); - } - } - - scheduler.Record( - [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) { - logical->UpdateDescriptorSet(set, update_template, payload_start); - }); +void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, + VkDescriptorSet set) { + const void* const data = upload_start; + const vk::Device* const logical = &device.GetLogical(); + scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) { + logical->UpdateDescriptorSet(set, update_template, data); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index cc7e3dff45..945320c724 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -15,17 +15,13 @@ namespace Vulkan { class VKDevice; class VKScheduler; -class DescriptorUpdateEntry { -public: - explicit DescriptorUpdateEntry() {} +struct DescriptorUpdateEntry { + DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {} - DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {} + DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {} - DescriptorUpdateEntry(VkDescriptorBufferInfo buffer) : buffer{buffer} {} + DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {} - DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {} - -private: union { VkDescriptorImageInfo image; VkDescriptorBufferInfo buffer; @@ -45,32 +41,34 @@ public: void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set); void AddSampledImage(VkSampler sampler, VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); } void AddImage(VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); } void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) { - entries.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); + payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); } void AddTexelBuffer(VkBufferView texel_buffer) { - entries.emplace_back(texel_buffer); + payload.emplace_back(texel_buffer); } - VkImageLayout* GetLastImageLayout() { - return &std::get(entries.back()).imageLayout; + VkImageLayout* LastImageLayout() { + return &payload.back().image.imageLayout; + } + + const VkImageLayout* LastImageLayout() const { + return &payload.back().image.imageLayout; } 
private: - using Variant = std::variant; - const VKDevice& device; VKScheduler& scheduler; - boost::container::static_vector entries; + const DescriptorUpdateEntry* upload_start = nullptr; boost::container::static_vector payload; }; From 977ceb405627adfec8be6240521d1db8842b8fc2 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Fri, 19 Jun 2020 11:39:41 +0100 Subject: [PATCH 095/145] macro_jit_x64: Remove unused function Read --- src/video_core/macro/macro_jit_x64.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 0b2918388b..80a00ba773 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -270,14 +270,6 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { Compile_ProcessResult(opcode.result_operation, opcode.dst); } -static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) { - return maxwell3d->GetRegisterValue(method); -} - -static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { - maxwell3d->CallMethodFromMME(method_address.address, value); -} - void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { if (optimizer.zero_reg_skip && opcode.src_a == 0) { if (opcode.immediate == 0) { @@ -312,6 +304,10 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { Compile_ProcessResult(opcode.result_operation, opcode.dst); } +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethodFromMME(method_address.address, value); +} + void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); mov(Common::X64::ABI_PARAM1, qword[STATE]); From c7ed7d94279082600eb75218ebe8b4cc6a3ab0d0 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 19 Jun 2020 22:17:56 +1000 Subject: [PATCH 096/145] Fix compilation when not building with boxcat Fixes compilation when trying to build without boxcat enabled --- src/core/CMakeLists.txt | 4 ++-- src/yuzu/configuration/configure_service.cpp | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 47418006b2..cb9ced5c90 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -606,11 +606,11 @@ endif() create_target_directory_groups(core) target_link_libraries(core PUBLIC common PRIVATE audio_core video_core) -target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls Opus::Opus unicorn) +target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls Opus::Opus unicorn zip) if (YUZU_ENABLE_BOXCAT) target_compile_definitions(core PRIVATE -DYUZU_ENABLE_BOXCAT) - target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json zip) + target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json) endif() if (ENABLE_WEB_SERVICE) diff --git a/src/yuzu/configuration/configure_service.cpp b/src/yuzu/configuration/configure_service.cpp index 06566e981b..92a1737b34 100644 --- a/src/yuzu/configuration/configure_service.cpp +++ b/src/yuzu/configuration/configure_service.cpp @@ -68,6 +68,7 @@ void ConfigureService::SetConfiguration() { } std::pair ConfigureService::BCATDownloadEvents() { +#ifdef YUZU_ENABLE_BOXCAT std::optional global; std::map map; const auto res = Service::BCAT::Boxcat::GetStatus(global, map); @@ -106,6 
+107,10 @@ std::pair ConfigureService::BCATDownloadEvents() { .arg(FormatEventStatusString(value)); } return {QStringLiteral("Current Boxcat Events"), std::move(out)}; +#else + return {QStringLiteral("Current Boxcat Events"), + tr("There are currently no events on boxcat.")}; +#endif } void ConfigureService::OnBCATImplChanged() { From 723639311469b6abb21d766fedad8fad3733106a Mon Sep 17 00:00:00 2001 From: MerryMage Date: Fri, 19 Jun 2020 14:29:09 +0100 Subject: [PATCH 097/145] mii_model: Remove redundant std::move Named return value optimization automatically applies here. --- src/core/file_sys/system_archive/mii_model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/file_sys/system_archive/mii_model.cpp b/src/core/file_sys/system_archive/mii_model.cpp index 6a9add87c6..61bb679459 100644 --- a/src/core/file_sys/system_archive/mii_model.cpp +++ b/src/core/file_sys/system_archive/mii_model.cpp @@ -40,7 +40,7 @@ VirtualDir MiiModel() { out->AddFile(std::make_shared>( MiiModelData::SHAPE_MID, "ShapeMid.dat")); - return std::move(out); + return out; } } // namespace FileSys::SystemArchive From 8272f53cf925c655e9cde49aaa63ba1766e89c0a Mon Sep 17 00:00:00 2001 From: MerryMage Date: Fri, 19 Jun 2020 14:29:36 +0100 Subject: [PATCH 098/145] input_common/keyboard: Remove redundant move Named return value optimization automatically applies here. --- src/input_common/keyboard.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/input_common/keyboard.cpp b/src/input_common/keyboard.cpp index 078374be54..afb8e6612c 100644 --- a/src/input_common/keyboard.cpp +++ b/src/input_common/keyboard.cpp @@ -76,7 +76,7 @@ std::unique_ptr Keyboard::Create(const Common::ParamPackage int key_code = params.Get("code", 0); std::unique_ptr button = std::make_unique(key_button_list); key_button_list->AddKeyButton(key_code, button.get()); - return std::move(button); + return button; } void Keyboard::PressKey(int key_code) { From c6a963c48e0a892588060bcc446b579f17fdd19c Mon Sep 17 00:00:00 2001 From: MerryMage Date: Fri, 19 Jun 2020 14:29:59 +0100 Subject: [PATCH 099/145] input_common/motion_emu: Remove redundant move Named return value optimization automatically applies here. --- src/input_common/motion_emu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/input_common/motion_emu.cpp b/src/input_common/motion_emu.cpp index 8682516285..d4cdf76a3f 100644 --- a/src/input_common/motion_emu.cpp +++ b/src/input_common/motion_emu.cpp @@ -145,7 +145,7 @@ std::unique_ptr MotionEmu::Create(const Common::ParamPackag // Previously created device is disconnected here. Having two motion devices for 3DS is not // expected. current_device = device_wrapper->device; - return std::move(device_wrapper); + return device_wrapper; } void MotionEmu::BeginTilt(int x, int y) { From 4514b80b3eedff01e994f225ea3d2da292c23e01 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 21:55:00 -0400 Subject: [PATCH 100/145] buffer_cache: Eliminate local variable shadowing We can just make use of the instance in the scope above this one. 
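For context, a minimal self-contained example of the hazard that shadowed declarations
introduce (a generic example, unrelated to the buffer cache itself): the inner declaration
silently hijacks the name, so edits intended for the outer object do nothing.

    #include <iostream>

    int main() {
        int hits = 0;
        {
            int hits = 10; // shadows the outer "hits"; -Wshadow flags this
            hits += 5;     // updates only the inner variable
        }
        std::cout << hits << '\n'; // prints 0, not 15
        return 0;
    }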
--- src/video_core/buffer_cache/buffer_cache.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 308d8b55f7..bae1d527cf 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -47,7 +47,7 @@ public: bool is_written = false, bool use_fast_cbuf = false) { std::lock_guard lock{mutex}; - const auto& memory_manager = system.GPU().MemoryManager(); + auto& memory_manager = system.GPU().MemoryManager(); const std::optional cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); if (!cpu_addr_opt) { return {GetEmptyBuffer(size), 0}; @@ -59,7 +59,6 @@ public: constexpr std::size_t max_stream_size = 0x800; if (use_fast_cbuf || size < max_stream_size) { if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { - auto& memory_manager = system.GPU().MemoryManager(); const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size); if (use_fast_cbuf) { u8* dest; From 811bff009eca0d0fa2ddb1455fc73fdaec4474da Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 21:57:41 -0400 Subject: [PATCH 101/145] macro_jit_x64: Eliminate variable shadowing in Compile_ProcessResult() We can reduce the capture scope so that it's not possible for both "reg" variables to clash with one another. While we're at it, we can prevent unnecessary copies while we're at it. --- src/video_core/macro/macro_jit_x64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index bee34a7c0c..9eface47ed 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -546,7 +546,7 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { } void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { - auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { + const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) { // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero // register. 
if (reg == 0) { @@ -554,7 +554,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3 } mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result); }; - auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; + const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); }; switch (operation) { case Macro::ResultOperation::IgnoreAndFetch: From a7fe6dc232ec20a5da2f41807e428e666748d9d8 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Sat, 20 Jun 2020 11:57:51 +1000 Subject: [PATCH 102/145] Add translation of "Current Boxcat Events" --- src/yuzu/configuration/configure_service.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/yuzu/configuration/configure_service.cpp b/src/yuzu/configuration/configure_service.cpp index 92a1737b34..0de7a4f0bb 100644 --- a/src/yuzu/configuration/configure_service.cpp +++ b/src/yuzu/configuration/configure_service.cpp @@ -106,10 +106,9 @@ std::pair ConfigureService::BCATDownloadEvents() { .arg(QString::fromStdString(key)) .arg(FormatEventStatusString(value)); } - return {QStringLiteral("Current Boxcat Events"), std::move(out)}; + return {tr("Current Boxcat Events"), std::move(out)}; #else - return {QStringLiteral("Current Boxcat Events"), - tr("There are currently no events on boxcat.")}; + return {tr("Current Boxcat Events"), tr("There are currently no events on boxcat.")}; #endif } From 479605b3e5a3b88128455b8357da471c713d0f90 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 22:02:56 -0400 Subject: [PATCH 103/145] memory_manager: Eliminate variable shadowing Renames some variables to prevent ones in inner scopes from shadowing outer-scoped variables. The Copy* functions have no shadowing, but we rename them anyways to remain consistent with the other functions. 
--- src/video_core/memory_manager.cpp | 40 +++++++++++++++++-------------- src/video_core/memory_manager.h | 12 +++++----- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index dbee9f6341..ff5505d124 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si return range == inner_size; } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { +void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, + const std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; auto& memory = system.Memory(); @@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s } } -void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, +void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, const std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; auto& memory = system.Memory(); @@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, } } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { +void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, + const std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; auto& memory = system.Memory(); @@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const } } -void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, +void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, const std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; auto& memory = system.Memory(); @@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, } } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, + const std::size_t size) { std::vector tmp_buffer(size); - ReadBlock(src_addr, tmp_buffer.data(), size); - WriteBlock(dest_addr, tmp_buffer.data(), size); + ReadBlock(gpu_src_addr, tmp_buffer.data(), size); + WriteBlock(gpu_dest_addr, tmp_buffer.data(), size); } -void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, + const std::size_t size) { std::vector tmp_buffer(size); - ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); - 
WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); + ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size); + WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size); } bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) { diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 0ddd52d5a1..87658e87af 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -79,9 +79,9 @@ public: * in the Host Memory counterpart. Note: This functions cause Host GPU Memory * Flushes and Invalidations, respectively to each operation. */ - void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and @@ -93,9 +93,9 @@ public: * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture * being flushed. */ - void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** * IsGranularRange checks if a gpu region can be simply read with a pointer From 8ea749c1ca50b3584df8c8d7019933fe80e31d9f Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 22:10:43 -0400 Subject: [PATCH 104/145] macro_jit_x64: Remove unused variable Removes a completely unused label and marks another variable as unused, given it seems like it has potential uses in the future. --- src/video_core/macro/macro_jit_x64.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index bee34a7c0c..3515ea43b8 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -54,7 +54,7 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { const bool is_a_zero = opcode.src_a == 0; const bool is_b_zero = opcode.src_b == 0; const bool valid_operation = !is_a_zero && !is_b_zero; - const bool is_move_operation = !is_a_zero && is_b_zero; + [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero; const bool has_zero_register = is_a_zero || is_b_zero; const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow; @@ -73,7 +73,6 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { src_b = Compile_GetRegister(opcode.src_b, eax); } } - Xbyak::Label skip_carry{}; bool has_emitted = false; From 140f953b6a70fa2eaf3f2711993913f6f0ca7a75 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 22:33:01 -0400 Subject: [PATCH 105/145] macro_jit_x64: Correct readability of Compile_ExtractShiftLeftRegister() Previously dst wasn't being used. 
--- src/video_core/macro/macro_jit_x64.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index bee34a7c0c..1ecf1d27f8 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -259,8 +259,8 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { } void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { - auto dst = Compile_GetRegister(opcode.src_a, eax); - auto src = Compile_GetRegister(opcode.src_b, RESULT); + const auto dst = Compile_GetRegister(opcode.src_a, eax); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); if (opcode.bf_src_bit != 0) { shr(src, opcode.bf_src_bit); @@ -269,7 +269,8 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { if (opcode.bf_size != 31) { and_(src, opcode.GetBitfieldMask()); } - shl(src, al); + shl(src, dst.cvt8()); + Compile_ProcessResult(opcode.result_operation, opcode.dst); } From 5a4e89b9018eec802ec445b5c7df7d270d35b4c1 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 22:56:34 -0400 Subject: [PATCH 106/145] macro_jit_x64: Correct readability of Compile_ExtractShiftLeftImmediate() Previously dst wasn't being used. --- src/video_core/macro/macro_jit_x64.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 1ecf1d27f8..202fbbc211 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -240,10 +240,10 @@ void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { } void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { - auto dst = Compile_GetRegister(opcode.src_a, eax); - auto src = Compile_GetRegister(opcode.src_b, RESULT); + const auto dst = Compile_GetRegister(opcode.src_a, eax); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); - shr(src, al); + shr(src, dst.cvt8()); if (opcode.bf_size != 0 && opcode.bf_size != 31) { and_(src, opcode.GetBitfieldMask()); } else if (opcode.bf_size == 0) { From a6e5b84d1fbfa976819645d8b7234d847756fc88 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 23:01:56 -0400 Subject: [PATCH 107/145] vulkan/wrapper: Remove noexcept from GetSurfaceCapabilitiesKHR() Check() can throw an exception if the Vulkan result isn't successful. We remove the check so that std::terminate isn't outright called and allows for better debugging (should it ever actually fail). 
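A small self-contained illustration of why the noexcept specifier is being dropped (a
generic example, not the Vulkan wrapper itself): an exception that escapes a noexcept
function never reaches a handler; the runtime calls std::terminate() instead, aborting the
process before any catch block can observe the failure.

    #include <stdexcept>

    void Check(bool ok) noexcept {
        if (!ok) {
            // With noexcept this cannot unwind; std::terminate() is called here,
            // hiding the error from callers and from handlers further up the stack.
            throw std::runtime_error("VkResult was not VK_SUCCESS");
        }
    }

    int main() {
        try {
            Check(false);
        } catch (const std::runtime_error&) {
            // Never reached while Check() is marked noexcept.
        }
        return 0;
    }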
--- src/video_core/renderer_vulkan/wrapper.cpp | 3 +-- src/video_core/renderer_vulkan/wrapper.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp index 2ce9b06264..42eff85d3a 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/renderer_vulkan/wrapper.cpp @@ -725,8 +725,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s return supported == VK_TRUE; } -VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const - noexcept { +VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const { VkSurfaceCapabilitiesKHR capabilities; Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities)); return capabilities; diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h index 98937a77a6..da42ca88ee 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/renderer_vulkan/wrapper.h @@ -779,7 +779,7 @@ public: bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const; - VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept; + VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const; std::vector GetSurfaceFormatsKHR(VkSurfaceKHR) const; From 8ea749c1ca50b3584df8c8d7019933fe80e31d9f Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 21:23:14 -0400 Subject: [PATCH 108/145] gl_arb_decompiler: Avoid several string copies Variables that are marked as const cannot have the move constructor invoked when returning from a function (the move constructor requires a non-const variable so it can "steal" the resources from it).
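More precisely, the named local in a return statement is treated as an rvalue for overload resolution, but a const object cannot bind to the move constructor's non-const rvalue reference, so the copy constructor is chosen whenever the copy is not elided. A small stand-alone sketch of the rule; the Tracer type is made up for illustration, and NRVO may still elide either constructor:

    #include <string>
    #include <utility>

    struct Tracer {
        std::string data;
        explicit Tracer(std::string value) : data{std::move(value)} {}
        Tracer(const Tracer&) = default;     // selected when a const local is returned
        Tracer(Tracer&&) noexcept = default; // selected when a non-const local is returned
    };

    Tracer ReturnsCopy() {
        const Tracer result{"temporary"};
        return result; // const disqualifies the implicit move; copies if not elided
    }

    Tracer ReturnsMove() {
        Tracer result{"temporary"};
        return result; // implicitly moved (or elided); no deep copy of the string
    }

Dropping const from the AllocTemporary() locals below lets those returns move instead of copy.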
--- .../renderer_opengl/gl_arb_decompiler.cpp | 63 +++++++++---------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index 1e96b03108..eb5158407d 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -281,14 +281,14 @@ private: template std::string Unary(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0])); return temporary; } template std::string Binary(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), Visit(operation[1])); return temporary; @@ -296,7 +296,7 @@ private: template std::string Trinary(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), Visit(operation[1]), Visit(operation[2])); return temporary; @@ -304,7 +304,7 @@ private: template std::string FloatComparison(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("TRUNC.U.CC RC.x, {};", Binary(operation)); AddLine("MOV.S {}, 0;", temporary); AddLine("MOV.S {} (NE.x), -1;", temporary); @@ -331,7 +331,7 @@ private: template std::string HalfComparison(Operation operation) { - const std::string tmp1 = AllocVectorTemporary(); + std::string tmp1 = AllocVectorTemporary(); const std::string tmp2 = AllocVectorTemporary(); const std::string op_a = Visit(operation[0]); const std::string op_b = Visit(operation[1]); @@ -367,15 +367,14 @@ private: AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); } - const std::string result = coord; - AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, result, value, coord, + AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord, image_id, ImageType(meta.image.type)); - return fmt::format("{}.x", result); + return fmt::format("{}.x", coord); } template std::string Atomic(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); std::string address; std::string_view opname; if (const auto gmem = std::get_if(&*operation[0])) { @@ -396,7 +395,7 @@ private: template std::string Negate(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); if constexpr (type == 'F') { AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0])); } else { @@ -407,7 +406,7 @@ private: template std::string Absolute(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0])); return temporary; } @@ -1156,20 +1155,20 @@ void ARBDecompiler::VisitAST(const ASTNode& node) { } std::string ARBDecompiler::VisitExpression(const Expr& node) { - const std::string result = AllocTemporary(); if (const auto expr = std::get_if(&*node)) { + std::string result = AllocTemporary(); AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1), VisitExpression(expr->operand2)); return result; } if (const auto expr = 
std::get_if(&*node)) { - const std::string result = AllocTemporary(); + std::string result = AllocTemporary(); AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1), VisitExpression(expr->operand2)); return result; } if (const auto expr = std::get_if(&*node)) { - const std::string result = AllocTemporary(); + std::string result = AllocTemporary(); AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1)); return result; } @@ -1186,7 +1185,7 @@ std::string ARBDecompiler::VisitExpression(const Expr& node) { return expr->value ? "0xffffffff" : "0"; } if (const auto expr = std::get_if(&*node)) { - const std::string result = AllocTemporary(); + std::string result = AllocTemporary(); AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value); return result; } @@ -1231,13 +1230,13 @@ std::string ARBDecompiler::Visit(const Node& node) { } if (const auto immediate = std::get_if(&*node)) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("MOV.U {}, {};", temporary, immediate->GetValue()); return temporary; } if (const auto predicate = std::get_if(&*node)) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); switch (const auto index = predicate->GetIndex(); index) { case Tegra::Shader::Pred::UnusedIndex: AddLine("MOV.S {}, -1;", temporary); @@ -1333,13 +1332,13 @@ std::string ARBDecompiler::Visit(const Node& node) { } else { offset_string = Visit(offset); } - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string); return temporary; } if (const auto gmem = std::get_if(&*node)) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), Visit(gmem->GetBaseAddress())); AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()), @@ -1348,14 +1347,14 @@ std::string ARBDecompiler::Visit(const Node& node) { } if (const auto lmem = std::get_if(&*node)) { - const std::string temporary = Visit(lmem->GetAddress()); + std::string temporary = Visit(lmem->GetAddress()); AddLine("SHR.U {}, {}, 2;", temporary, temporary); AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary); return temporary; } if (const auto smem = std::get_if(&*node)) { - const std::string temporary = Visit(smem->GetAddress()); + std::string temporary = Visit(smem->GetAddress()); AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary); return temporary; } @@ -1535,7 +1534,7 @@ std::string ARBDecompiler::Assign(Operation operation) { } std::string ARBDecompiler::Select(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]), Visit(operation[2])); return temporary; @@ -1545,12 +1544,12 @@ std::string ARBDecompiler::FClamp(Operation operation) { // 1.0f in hex, replace with std::bit_cast on C++20 static constexpr u32 POSITIVE_ONE = 0x3f800000; - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); const Node& value = operation[0]; const Node& low = operation[1]; const Node& high = operation[2]; - const auto imm_low = std::get_if(&*low); - const auto imm_high = std::get_if(&*high); + const auto* const imm_low = std::get_if(&*low); + const auto* const imm_high = std::get_if(&*high); if (imm_low 
&& imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) { AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value)); } else { @@ -1574,7 +1573,7 @@ std::string ARBDecompiler::FCastHalf1(Operation operation) { } std::string ARBDecompiler::FSqrt(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0])); AddLine("RCP.F32 {}, {};", temporary, temporary); return temporary; @@ -1588,7 +1587,7 @@ std::string ARBDecompiler::FSwizzleAdd(Operation operation) { AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); return fmt::format("{}.x", temporary); } - const std::string lut = AllocVectorTemporary(); + AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage)); AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary); AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary); @@ -1766,21 +1765,21 @@ std::string ARBDecompiler::LogicalAssign(Operation operation) { } std::string ARBDecompiler::LogicalPick2(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); const u32 index = std::get(*operation[1]).GetValue(); AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index)); return temporary; } std::string ARBDecompiler::LogicalAnd2(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); const std::string op = Visit(operation[0]); AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op); return temporary; } std::string ARBDecompiler::FloatOrdered(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); AddLine("MOV.S {}, -1;", temporary); @@ -1790,7 +1789,7 @@ std::string ARBDecompiler::FloatOrdered(Operation operation) { } std::string ARBDecompiler::FloatUnordered(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); AddLine("MOV.S {}, 0;", temporary); @@ -1800,7 +1799,7 @@ std::string ARBDecompiler::FloatUnordered(Operation operation) { } std::string ARBDecompiler::LogicalAddCarry(Operation operation) { - const std::string temporary = AllocTemporary(); + std::string temporary = AllocTemporary(); AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1])); AddLine("MOV.S {}, 0;", temporary); AddLine("IF CF.x;"); From 480e1fa987ce427ce3208a49ae3f08494c417c5c Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sun, 14 Jun 2020 00:02:42 -0400 Subject: [PATCH 109/145] decode/image: Implement B10G11R11F - Used by Kirby Star Allies --- src/video_core/shader/decode/image.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index 60b6ad72a4..07778dc3e0 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, break; case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::BF10GF11RF11: if (component == 0) { return descriptor.b_type; 
} @@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, } break; } - UNIMPLEMENTED_MSG("texture format not implement={}", format); + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); return ComponentType::FLOAT; } @@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 6; } return 0; + case TextureFormat::BF10GF11RF11: + if (component == 1 || component == 2) { + return 11; + } + if (component == 0) { + return 10; + } + return 0; case TextureFormat::G8R24: if (component == 0) { return 8; @@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return (component == 0 || component == 1) ? 8 : 0; case TextureFormat::G4R4: return (component == 0 || component == 1) ? 4 : 0; - default: - UNIMPLEMENTED_MSG("texture format not implement={}", format); - return 0; } + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return 0; } std::size_t GetImageComponentMask(TextureFormat format) { @@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) { case TextureFormat::R32_B24G8: case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::BF10GF11RF11: return std::size_t{R | G | B}; case TextureFormat::R32_G32: case TextureFormat::R16_G16: @@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) { case TextureFormat::R8: case TextureFormat::R1: return std::size_t{R}; - default: - UNIMPLEMENTED_MSG("texture format not implement={}", format); - return std::size_t{R | G | B | A}; } + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return std::size_t{R | G | B | A}; } std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) { @@ -299,7 +307,7 @@ std::pair ShaderIR::GetComponentValue(ComponentType component_type, return {std::move(original_value), true}; } default: - UNIMPLEMENTED_MSG("Unimplement component type={}", component_type); + UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type); return {std::move(original_value), true}; } } @@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { default: break; } - UNIMPLEMENTED_MSG("Unimplemented operation={} type={}", + UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}", static_cast(instr.suatom_d.operation.Value()), static_cast(instr.suatom_d.operation_type.Value())); return OperationCode::AtomicImageAdd; From d6474b4aca7f054d00df350c716709475ef0f49b Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sat, 16 May 2020 07:24:57 -0400 Subject: [PATCH 110/145] common/cpu_detect: Add AVX512 detection --- src/common/x64/cpu_detect.cpp | 5 +++++ src/common/x64/cpu_detect.h | 1 + 2 files changed, 6 insertions(+) diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp index c9349a6b42..f35dcb498c 100644 --- a/src/common/x64/cpu_detect.cpp +++ b/src/common/x64/cpu_detect.cpp @@ -110,6 +110,11 @@ static CPUCaps Detect() { caps.bmi1 = true; if ((cpu_id[1] >> 8) & 1) caps.bmi2 = true; + // Checks for AVX512F, AVX512CD, AVX512VL, AVX512DQ, AVX512BW (Intel Skylake-X/SP) + if ((cpu_id[1] >> 16) & 1 && (cpu_id[1] >> 28) & 1 && (cpu_id[1] >> 31) & 1 && + (cpu_id[1] >> 17) & 1 && (cpu_id[1] >> 30) & 1) { + caps.avx512 = caps.avx2; + } } } diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h index 20f2ba234a..7606c3f7b2 100644 --- a/src/common/x64/cpu_detect.h +++ b/src/common/x64/cpu_detect.h @@ -19,6 +19,7 @@ struct CPUCaps { bool lzcnt; 
bool avx; bool avx2; + bool avx512; bool bmi1; bool bmi2; bool fma; From 97ba520434cceb42af3b17a59c731dd734e9108f Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sat, 16 May 2020 07:25:13 -0400 Subject: [PATCH 111/145] common/telemetry: Add AVX512 to telemetry --- src/common/telemetry.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/telemetry.cpp b/src/common/telemetry.cpp index 200c6489aa..16d42facde 100644 --- a/src/common/telemetry.cpp +++ b/src/common/telemetry.cpp @@ -60,6 +60,7 @@ void AppendCPUInfo(FieldCollection& fc) { fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AES", Common::GetCPUCaps().aes); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX", Common::GetCPUCaps().avx); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX2", Common::GetCPUCaps().avx2); + fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX512", Common::GetCPUCaps().avx512); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI1", Common::GetCPUCaps().bmi1); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI2", Common::GetCPUCaps().bmi2); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_FMA", Common::GetCPUCaps().fma); From 9bb5bf0b2bf62207c117f05aa76d0540b6e106fd Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sat, 16 May 2020 07:35:15 -0400 Subject: [PATCH 112/145] main: Append AVX and FMA instructions to cpu string Append AVX and FMA instructions to cpu string if the host cpu supports them --- src/yuzu/main.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 4119d79077..059b96e70e 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -217,7 +217,20 @@ GMainWindow::GMainWindow() LOG_INFO(Frontend, "yuzu Version: {} | {}-{}", yuzu_build_version, Common::g_scm_branch, Common::g_scm_desc); #ifdef ARCHITECTURE_x86_64 - LOG_INFO(Frontend, "Host CPU: {}", Common::GetCPUCaps().cpu_string); + const auto& caps = Common::GetCPUCaps(); + std::string cpu_string = caps.cpu_string; + if (caps.avx || caps.avx2 || caps.avx512) { + cpu_string += " | AVX"; + if (caps.avx512) { + cpu_string += "512"; + } else if (caps.avx2) { + cpu_string += '2'; + } + if (caps.fma || caps.fma4) { + cpu_string += " | FMA"; + } + } + LOG_INFO(Frontend, "Host CPU: {}", cpu_string); #endif LOG_INFO(Frontend, "Host OS: {}", QSysInfo::prettyProductName().toStdString()); LOG_INFO(Frontend, "Host RAM: {:.2f} GB", From a5ed0c3df7218f922f98d4bd1fed1214ec728869 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Sat, 20 Jun 2020 01:06:05 -0400 Subject: [PATCH 113/145] software_keyboard: Eliminate trivial redundant copies We can just make use of moves here to get rid of two redundant copies --- src/core/hle/service/am/applets/software_keyboard.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/hle/service/am/applets/software_keyboard.cpp b/src/core/hle/service/am/applets/software_keyboard.cpp index 54e63c138b..d14076b02c 100644 --- a/src/core/hle/service/am/applets/software_keyboard.cpp +++ b/src/core/hle/service/am/applets/software_keyboard.cpp @@ -30,7 +30,7 @@ static Core::Frontend::SoftwareKeyboardParameters ConvertToFrontendParameters( config.sub_text.size()); params.guide_text = Common::UTF16StringFromFixedZeroTerminatedBuffer(config.guide_text.data(), config.guide_text.size()); - params.initial_text = initial_text; + params.initial_text = std::move(initial_text); params.max_length = config.length_limit == 
0 ? DEFAULT_MAX_LENGTH : config.length_limit; params.password = static_cast(config.is_password); params.cursor_at_beginning = static_cast(config.initial_cursor_position); @@ -109,7 +109,7 @@ void SoftwareKeyboard::Execute() { const auto parameters = ConvertToFrontendParameters(config, initial_text); - frontend.RequestText([this](std::optional text) { WriteText(text); }, + frontend.RequestText([this](std::optional text) { WriteText(std::move(text)); }, parameters); } From ef53b2fd08f1122f22456500bfdc707f1c18906c Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 19 Jun 2020 23:13:48 -0400 Subject: [PATCH 114/145] texture_cache: Fix incorrect address used in a DeduceSurface() call Previously the source was being deduced twice in a row. --- src/video_core/texture_cache/texture_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index b543fc8c03..85075e868c 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1053,7 +1053,7 @@ private: void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params, const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) { auto deduced_src = DeduceSurface(src_gpu_addr, src_params); - auto deduced_dst = DeduceSurface(src_gpu_addr, src_params); + auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params); if (deduced_src.Failed() || deduced_dst.Failed()) { return; } From c12eb814b41b5b354df2548d5d48b9ae529ad4b8 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 20 Jun 2020 22:23:58 +0100 Subject: [PATCH 115/145] macro_jit_x64: Use ecx for shift register shl/shr only accept cl as their second argument --- src/video_core/macro/macro_jit_x64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 4eef342ecf..389b58989c 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -239,7 +239,7 @@ void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { } void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { - const auto dst = Compile_GetRegister(opcode.src_a, eax); + const auto dst = Compile_GetRegister(opcode.src_a, ecx); const auto src = Compile_GetRegister(opcode.src_b, RESULT); shr(src, dst.cvt8()); @@ -258,7 +258,7 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { } void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { - const auto dst = Compile_GetRegister(opcode.src_a, eax); + const auto dst = Compile_GetRegister(opcode.src_a, ecx); const auto src = Compile_GetRegister(opcode.src_b, RESULT); if (opcode.bf_src_bit != 0) { From a8674a7b8618b1f17b7abb8908ea7cd4367aa730 Mon Sep 17 00:00:00 2001 From: FearlessTobi Date: Sun, 21 Jun 2020 03:11:23 +0200 Subject: [PATCH 116/145] Fix: fatal error CVT1100 when compiling manifest file Occurs when doing a local compile in MSVC build. The compiler I'm using is as below: Microsoft Visual Studio Community 2019 Preview Version 16.6.0 Preview 5.0 Fixes this error: CVTRES : fatal error CVT1100: duplicate resource. type:MANIFEST, name:1, language:0x0409 LINK : fatal error LNK1123: failure during conversion to COFF: file invalid or corrupt I have put 0 since previous name was 1. If have other names in mind, please let me know. 
Co-Authored-By: dragios --- src/yuzu/yuzu.rc | 2 +- src/yuzu_cmd/yuzu.rc | 2 +- src/yuzu_tester/yuzu.rc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/yuzu/yuzu.rc b/src/yuzu/yuzu.rc index 1b253653fb..4a3645a719 100644 --- a/src/yuzu/yuzu.rc +++ b/src/yuzu/yuzu.rc @@ -16,4 +16,4 @@ IDI_ICON1 ICON "../../dist/yuzu.ico" // RT_MANIFEST // -1 RT_MANIFEST "../../dist/yuzu.manifest" +0 RT_MANIFEST "../../dist/yuzu.manifest" diff --git a/src/yuzu_cmd/yuzu.rc b/src/yuzu_cmd/yuzu.rc index 7de8ef3d90..0cde75e2f8 100644 --- a/src/yuzu_cmd/yuzu.rc +++ b/src/yuzu_cmd/yuzu.rc @@ -14,4 +14,4 @@ YUZU_ICON ICON "../../dist/yuzu.ico" // RT_MANIFEST // -1 RT_MANIFEST "../../dist/yuzu.manifest" +0 RT_MANIFEST "../../dist/yuzu.manifest" diff --git a/src/yuzu_tester/yuzu.rc b/src/yuzu_tester/yuzu.rc index 7de8ef3d90..0cde75e2f8 100644 --- a/src/yuzu_tester/yuzu.rc +++ b/src/yuzu_tester/yuzu.rc @@ -14,4 +14,4 @@ YUZU_ICON ICON "../../dist/yuzu.ico" // RT_MANIFEST // -1 RT_MANIFEST "../../dist/yuzu.manifest" +0 RT_MANIFEST "../../dist/yuzu.manifest" From 20ed33b53b29ef29c0c9578a6ac630bb81717b80 Mon Sep 17 00:00:00 2001 From: FearlessTobi Date: Sun, 21 Jun 2020 03:16:12 +0200 Subject: [PATCH 117/145] Update manifest file to include new elements that are introduced with Windows 10 later versions Co-Authored-By: dragios --- dist/yuzu.manifest | 80 +++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/dist/yuzu.manifest b/dist/yuzu.manifest index fd30b656ff..038edff230 100644 --- a/dist/yuzu.manifest +++ b/dist/yuzu.manifest @@ -1,24 +1,58 @@ - - - - - - - - - - - True/PM - true - - - - - - - - - - - \ No newline at end of file + + + + + + true/pm + + + + PerMonitorV2 + + + + true + + + true + + + + + + + + + + + + + + + + + + + + + + + + From 1e65da971bf6edd5611e6e409ba1cc4f99e58655 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sat, 20 Jun 2020 07:41:55 -0400 Subject: [PATCH 118/145] gl_device: Check for GL_EXT_texture_shadow_lod --- src/video_core/renderer_opengl/gl_device.cpp | 2 ++ src/video_core/renderer_opengl/gl_device.h | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index b31d604e42..1011c7738e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -216,6 +216,7 @@ Device::Device() has_shader_ballot = GLAD_GL_ARB_shader_ballot; has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted"); + has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod"); has_astc = IsASTCSupported(); has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; @@ -245,6 +246,7 @@ Device::Device(std::nullptr_t) { has_shader_ballot = true; has_vertex_viewport_layer = true; has_image_load_formatted = true; + has_texture_shadow_lod = true; has_variable_aoffi = true; } diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 145347943a..c86e709b1f 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -68,6 +68,10 @@ public: return has_image_load_formatted; } + bool HasTextureShadowLod() const { + return has_texture_shadow_lod; + } + bool HasASTC() const { return has_astc; } @@ -110,6 +114,7 @@ private: bool 
has_shader_ballot{}; bool has_vertex_viewport_layer{}; bool has_image_load_formatted{}; + bool has_texture_shadow_lod{}; bool has_astc{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; From f77c897b8d5287adb64e08f65e494dac45033de3 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sat, 20 Jun 2020 07:43:04 -0400 Subject: [PATCH 119/145] gl_shader_decompiler: Enable GL_EXT_texture_shadow_lod if available Enable GL_EXT_texture_shadow_lod if available. If this extension is not available, such as on Intel/AMD proprietary drivers, use textureGrad as a workaround. --- .../renderer_opengl/gl_shader_decompiler.cpp | 50 ++++++++++++++++--- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index d6e30b321f..2c49aeaacc 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::PixelImap; using Tegra::Shader::Register; +using Tegra::Shader::TextureType; using VideoCommon::Shader::BuildTransformFeedback; using VideoCommon::Shader::Registry; @@ -526,6 +527,9 @@ private: if (device.HasImageLoadFormatted()) { code.AddLine("#extension GL_EXT_shader_image_load_formatted : require"); } + if (device.HasTextureShadowLod()) { + code.AddLine("#extension GL_EXT_texture_shadow_lod : require"); + } if (device.HasWarpIntrinsics()) { code.AddLine("#extension GL_NV_gpu_shader5 : require"); code.AddLine("#extension GL_NV_shader_thread_group : require"); @@ -909,13 +913,13 @@ private: return "samplerBuffer"; } switch (sampler.type) { - case Tegra::Shader::TextureType::Texture1D: + case TextureType::Texture1D: return "sampler1D"; - case Tegra::Shader::TextureType::Texture2D: + case TextureType::Texture2D: return "sampler2D"; - case Tegra::Shader::TextureType::Texture3D: + case TextureType::Texture3D: return "sampler3D"; - case Tegra::Shader::TextureType::TextureCube: + case TextureType::TextureCube: return "samplerCube"; default: UNREACHABLE(); @@ -1380,8 +1384,19 @@ private: const std::size_t count = operation.GetOperandsCount(); const bool has_array = meta->sampler.is_array; const bool has_shadow = meta->sampler.is_shadow; + const bool workaround_lod_array_shadow_as_grad = + !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube); + + std::string expr = "texture"; + + if (workaround_lod_array_shadow_as_grad) { + expr += "Grad"; + } else { + expr += function_suffix; + } - std::string expr = "texture" + function_suffix; if (!meta->aoffi.empty()) { expr += "Offset"; } else if (!meta->ptp.empty()) { @@ -1415,6 +1430,16 @@ private: expr += ')'; } + if (workaround_lod_array_shadow_as_grad) { + switch (meta->sampler.type) { + case TextureType::Texture2D: + return expr + ", vec2(0.0), vec2(0.0))"; + case TextureType::TextureCube: + return expr + ", vec3(0.0), vec3(0.0))"; + } + UNREACHABLE(); + } + for (const auto& variant : extras) { if (const auto argument = std::get_if(&variant)) { expr += GenerateTextureArgument(*argument); @@ -2041,8 +2066,19 @@ private: const auto meta = std::get_if(&operation.GetMeta()); ASSERT(meta); - std::string expr = GenerateTexture( - operation, "Lod", {TextureArgument{Type::Float, 
meta->lod}, TextureOffset{}}); + std::string expr{}; + + if (!device.HasTextureShadowLod() && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube)) { + LOG_ERROR(Render_OpenGL, + "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround"); + expr = GenerateTexture(operation, "Lod", {}); + } else { + expr = GenerateTexture(operation, "Lod", + {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); + } + if (meta->sampler.is_shadow) { expr = "vec4(" + expr + ')'; } From b81af6ae9bdbf7edcdc6c1ec4f68d5e29ad18c5f Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Sun, 21 Jun 2020 06:09:28 +0200 Subject: [PATCH 120/145] Add a "Open Mods Page" button to the GUI --- src/yuzu/main.cpp | 13 +++++++++++++ src/yuzu/main.h | 1 + src/yuzu/main.ui | 6 ++++++ 3 files changed, 20 insertions(+) diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 270cccc772..826f8903be 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -57,6 +57,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual #include #include #include +#include #include #include "common/common_paths.h" @@ -826,6 +827,7 @@ void GMainWindow::ConnectMenuEvents() { connect(ui.action_Stop, &QAction::triggered, this, &GMainWindow::OnStopGame); connect(ui.action_Report_Compatibility, &QAction::triggered, this, &GMainWindow::OnMenuReportCompatibility); + connect(ui.action_Open_Mods_Page, &QAction::triggered, this, &GMainWindow::OnSwitchModsPage); connect(ui.action_Restart, &QAction::triggered, this, [this] { BootGame(QString(game_path)); }); connect(ui.action_Configure, &QAction::triggered, this, &GMainWindow::OnConfigure); @@ -1797,6 +1799,17 @@ void GMainWindow::OnMenuReportCompatibility() { } } +void GMainWindow::OnSwitchModsPage() { + const std::string mods_page_url = "https://github.com/yuzu-emu/yuzu/wiki/Switch-Mods"; + const QString mods_page_url_qs = QString::fromStdString(mods_page_url); + const QUrl mods_page(mods_page_url_qs); + const bool open = QDesktopServices::openUrl(mods_page); + if (!open) { + QMessageBox::warning(this, tr("Error opening URL"), + tr("Unable to open the URL \"%1\".").arg(mods_page_url_qs)); + } +} + void GMainWindow::ToggleFullscreen() { if (!emulation_running) { return; diff --git a/src/yuzu/main.h b/src/yuzu/main.h index 4f4c8ddbee..7d99106080 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -181,6 +181,7 @@ private slots: void OnPauseGame(); void OnStopGame(); void OnMenuReportCompatibility(); + void OnSwitchModsPage(); /// Called whenever a user selects a game in the game list widget. 
void OnGameListLoadFile(QString game_path); void OnGameListOpenFolder(GameListOpenTarget target, const std::string& game_path); diff --git a/src/yuzu/main.ui b/src/yuzu/main.ui index 97c90f50bb..b5745dfd56 100644 --- a/src/yuzu/main.ui +++ b/src/yuzu/main.ui @@ -113,6 +113,7 @@ &Help + @@ -256,6 +257,11 @@ false + + + Open Mods Page + + Open yuzu Folder From 606e833d26a279dc1621b48b21994f1a6de3bf17 Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Sun, 21 Jun 2020 06:12:23 +0200 Subject: [PATCH 121/145] Address review comment by Lioncash Co-authored-by: LC --- src/yuzu/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 826f8903be..0ce7cd62a4 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -1800,7 +1800,7 @@ void GMainWindow::OnMenuReportCompatibility() { } void GMainWindow::OnSwitchModsPage() { - const std::string mods_page_url = "https://github.com/yuzu-emu/yuzu/wiki/Switch-Mods"; + const auto mods_page_url = QStringLiteral("https://github.com/yuzu-emu/yuzu/wiki/Switch-Mods"); const QString mods_page_url_qs = QString::fromStdString(mods_page_url); const QUrl mods_page(mods_page_url_qs); const bool open = QDesktopServices::openUrl(mods_page); From d11b04ed46b2c6720e9bb222e98482097274a1f5 Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Sun, 21 Jun 2020 06:16:03 +0200 Subject: [PATCH 122/145] Remove unnecessary conversion --- src/yuzu/main.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 0ce7cd62a4..6e6e5dffef 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -1801,12 +1801,11 @@ void GMainWindow::OnMenuReportCompatibility() { void GMainWindow::OnSwitchModsPage() { const auto mods_page_url = QStringLiteral("https://github.com/yuzu-emu/yuzu/wiki/Switch-Mods"); - const QString mods_page_url_qs = QString::fromStdString(mods_page_url); - const QUrl mods_page(mods_page_url_qs); + const QUrl mods_page(mods_page_url); const bool open = QDesktopServices::openUrl(mods_page); if (!open) { QMessageBox::warning(this, tr("Error opening URL"), - tr("Unable to open the URL \"%1\".").arg(mods_page_url_qs)); + tr("Unable to open the URL \"%1\".").arg(mods_page_url)); } } From 23d57ed4f733731206abcd4d9db7d8bdb34208e7 Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Sun, 21 Jun 2020 06:17:46 +0200 Subject: [PATCH 123/145] Clang-format --- src/yuzu/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 6e6e5dffef..e53c65f81f 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -56,8 +56,8 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual #include #include #include -#include #include +#include #include #include "common/common_paths.h" From 182ac8a504233143fd2f78ef0dc4fb94331cff85 Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Sun, 21 Jun 2020 18:09:14 +0200 Subject: [PATCH 124/145] Correct function name (1/2) --- src/yuzu/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/yuzu/main.h b/src/yuzu/main.h index 7d99106080..d55e55cc66 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -181,7 +181,7 @@ private slots: void OnPauseGame(); void OnStopGame(); void OnMenuReportCompatibility(); - void OnSwitchModsPage(); + void OnOpenModsPage(); /// Called whenever a user selects a 
game in the game list widget. void OnGameListLoadFile(QString game_path); void OnGameListOpenFolder(GameListOpenTarget target, const std::string& game_path); From 409fedaf974f6c6add862a518c8fa5e7c8a341b1 Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Sun, 21 Jun 2020 18:10:23 +0200 Subject: [PATCH 125/145] Correct function name (2/2) --- src/yuzu/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index e53c65f81f..cb36bb8692 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -827,7 +827,7 @@ void GMainWindow::ConnectMenuEvents() { connect(ui.action_Stop, &QAction::triggered, this, &GMainWindow::OnStopGame); connect(ui.action_Report_Compatibility, &QAction::triggered, this, &GMainWindow::OnMenuReportCompatibility); - connect(ui.action_Open_Mods_Page, &QAction::triggered, this, &GMainWindow::OnSwitchModsPage); + connect(ui.action_Open_Mods_Page, &QAction::triggered, this, &GMainWindow::OnOpenModsPage); connect(ui.action_Restart, &QAction::triggered, this, [this] { BootGame(QString(game_path)); }); connect(ui.action_Configure, &QAction::triggered, this, &GMainWindow::OnConfigure); @@ -1799,7 +1799,7 @@ void GMainWindow::OnMenuReportCompatibility() { } } -void GMainWindow::OnSwitchModsPage() { +void GMainWindow::OnOpenModsPage() { const auto mods_page_url = QStringLiteral("https://github.com/yuzu-emu/yuzu/wiki/Switch-Mods"); const QUrl mods_page(mods_page_url); const bool open = QDesktopServices::openUrl(mods_page); From 0235915baaca8c0928a54fae1df60595e451e223 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sun, 21 Jun 2020 15:48:02 -0400 Subject: [PATCH 126/145] hid: Implement Get/ResetGyroscopeZeroDriftMode - Used by Captain Toad Treasure Tracker --- src/core/hle/service/hid/controllers/npad.cpp | 8 ++++ src/core/hle/service/hid/controllers/npad.h | 10 ++++- src/core/hle/service/hid/hid.cpp | 42 ++++++++++++++++--- src/core/hle/service/hid/hid.h | 2 + 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/src/core/hle/service/hid/controllers/npad.cpp b/src/core/hle/service/hid/controllers/npad.cpp index c55d900e27..6fbee7efa3 100644 --- a/src/core/hle/service/hid/controllers/npad.cpp +++ b/src/core/hle/service/hid/controllers/npad.cpp @@ -566,6 +566,14 @@ void Controller_NPad::DisconnectNPad(u32 npad_id) { connected_controllers[NPadIdToIndex(npad_id)].is_connected = false; } +void Controller_NPad::SetGyroscopeZeroDriftMode(GyroscopeZeroDriftMode drift_mode) { + gyroscope_zero_drift_mode = drift_mode; +} + +Controller_NPad::GyroscopeZeroDriftMode Controller_NPad::GetGyroscopeZeroDriftMode() const { + return gyroscope_zero_drift_mode; +} + void Controller_NPad::StartLRAssignmentMode() { // Nothing internally is used for lr assignment mode. 
Since we have the ability to set the // controller types from boot, it doesn't really matter about showing a selection screen diff --git a/src/core/hle/service/hid/controllers/npad.h b/src/core/hle/service/hid/controllers/npad.h index 931f03430e..5d4c58a431 100644 --- a/src/core/hle/service/hid/controllers/npad.h +++ b/src/core/hle/service/hid/controllers/npad.h @@ -58,6 +58,12 @@ public: }; static_assert(sizeof(Vibration) == 0x10, "Vibration is an invalid size"); + enum class GyroscopeZeroDriftMode : u32 { + Loose = 0, + Standard = 1, + Tight = 2, + }; + enum class NpadHoldType : u64 { Vertical = 0, Horizontal = 1, @@ -117,6 +123,8 @@ public: void ConnectNPad(u32 npad_id); void DisconnectNPad(u32 npad_id); + void SetGyroscopeZeroDriftMode(GyroscopeZeroDriftMode drift_mode); + GyroscopeZeroDriftMode GetGyroscopeZeroDriftMode() const; LedPattern GetLedPattern(u32 npad_id); void SetVibrationEnabled(bool can_vibrate); bool IsVibrationEnabled() const; @@ -324,8 +332,8 @@ private: std::array styleset_changed_events; Vibration last_processed_vibration{}; std::array connected_controllers{}; + GyroscopeZeroDriftMode gyroscope_zero_drift_mode{GyroscopeZeroDriftMode::Standard}; bool can_controllers_vibrate{true}; - std::array npad_pad_states{}; bool is_in_lr_assignment_mode{false}; Core::System& system; diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index 72a050de25..415c2829ed 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -185,8 +185,8 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) { {77, nullptr, "GetAccelerometerPlayMode"}, {78, nullptr, "ResetAccelerometerPlayMode"}, {79, &Hid::SetGyroscopeZeroDriftMode, "SetGyroscopeZeroDriftMode"}, - {80, nullptr, "GetGyroscopeZeroDriftMode"}, - {81, nullptr, "ResetGyroscopeZeroDriftMode"}, + {80, &Hid::GetGyroscopeZeroDriftMode, "GetGyroscopeZeroDriftMode"}, + {81, &Hid::ResetGyroscopeZeroDriftMode, "ResetGyroscopeZeroDriftMode"}, {82, &Hid::IsSixAxisSensorAtRest, "IsSixAxisSensorAtRest"}, {83, nullptr, "IsFirmwareUpdateAvailableForSixAxisSensor"}, {91, &Hid::ActivateGesture, "ActivateGesture"}, @@ -419,9 +419,41 @@ void Hid::SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) { const auto drift_mode{rp.Pop()}; const auto applet_resource_user_id{rp.Pop()}; - LOG_WARNING(Service_HID, - "(STUBBED) called, handle={}, drift_mode={}, applet_resource_user_id={}", handle, - drift_mode, applet_resource_user_id); + applet_resource->GetController(HidController::NPad) + .SetGyroscopeZeroDriftMode(Controller_NPad::GyroscopeZeroDriftMode{drift_mode}); + + LOG_DEBUG(Service_HID, "called, handle={}, drift_mode={}, applet_resource_user_id={}", handle, + drift_mode, applet_resource_user_id); + + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_SUCCESS); +} + +void Hid::GetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto handle{rp.Pop()}; + const auto applet_resource_user_id{rp.Pop()}; + + LOG_DEBUG(Service_HID, "called, handle={}, applet_resource_user_id={}", handle, + applet_resource_user_id); + + IPC::ResponseBuilder rb{ctx, 3}; + rb.Push(RESULT_SUCCESS); + rb.Push( + static_cast(applet_resource->GetController(HidController::NPad) + .GetGyroscopeZeroDriftMode())); +} + +void Hid::ResetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto handle{rp.Pop()}; + const auto applet_resource_user_id{rp.Pop()}; + + applet_resource->GetController(HidController::NPad) + 
.SetGyroscopeZeroDriftMode(Controller_NPad::GyroscopeZeroDriftMode::Standard); + + LOG_DEBUG(Service_HID, "called, handle={}, applet_resource_user_id={}", handle, + applet_resource_user_id); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h index d481a75f85..41b330fa93 100644 --- a/src/core/hle/service/hid/hid.h +++ b/src/core/hle/service/hid/hid.h @@ -95,6 +95,8 @@ private: void ActivateNpadWithRevision(Kernel::HLERequestContext& ctx); void StartSixAxisSensor(Kernel::HLERequestContext& ctx); void SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx); + void GetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx); + void ResetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx); void IsSixAxisSensorAtRest(Kernel::HLERequestContext& ctx); void SetSupportedNpadStyleSet(Kernel::HLERequestContext& ctx); void GetSupportedNpadStyleSet(Kernel::HLERequestContext& ctx); From e0af4cdf9871e42cb3aaef950b3505b5af6b679e Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 22 Jun 2020 06:59:41 -0400 Subject: [PATCH 127/145] arm_dynarmic_32: Log under Core_ARM instead of HW_GPU --- src/core/arm/dynarmic/arm_dynarmic_32.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp index 19d798dc70..4c8663d03c 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp @@ -62,7 +62,7 @@ public: case Dynarmic::A32::Exception::Breakpoint: break; } - LOG_CRITICAL(HW_GPU, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", + LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", static_cast(exception), pc, MemoryReadCode(pc)); UNIMPLEMENTED(); } From f2df941e8dbecbb0c3240039f637ae749f633eb6 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 22 Jun 2020 07:00:24 -0400 Subject: [PATCH 128/145] arm_dynarmic_64: Log the instruction when an exception is raised --- src/core/arm/dynarmic/arm_dynarmic_64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp index 337b97be94..5f5e36d94b 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp @@ -98,8 +98,8 @@ public: } [[fallthrough]]; default: - ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:X})", - static_cast(exception), pc); + ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", + static_cast(exception), pc, MemoryReadCode(pc)); } } From e193aa3f53a09850606145466d6caa9abd3fb856 Mon Sep 17 00:00:00 2001 From: VolcaEM <63682805+VolcaEM@users.noreply.github.com> Date: Mon, 22 Jun 2020 22:03:26 +0200 Subject: [PATCH 129/145] account: Update function tables and add missing classes (#4145) * account: Update function tables and add missing classes * clang-format * Add missing "public" * Add missing public again * Add missing final --- src/core/hle/service/acc/acc.cpp | 341 ++++++++++++++++++++++++++++ src/core/hle/service/acc/acc_aa.cpp | 4 +- src/core/hle/service/acc/acc_su.cpp | 34 +-- src/core/hle/service/acc/acc_u0.cpp | 18 +- src/core/hle/service/acc/acc_u1.cpp | 29 +-- 5 files changed, 384 insertions(+), 42 deletions(-) diff --git a/src/core/hle/service/acc/acc.cpp b/src/core/hle/service/acc/acc.cpp index 630a8b048d..94d8c1fc6e 
100644 --- a/src/core/hle/service/acc/acc.cpp +++ b/src/core/hle/service/acc/acc.cpp @@ -44,6 +44,218 @@ static constexpr u32 SanitizeJPEGSize(std::size_t size) { return static_cast(std::min(size, max_jpeg_image_size)); } +class IManagerForSystemService final : public ServiceFramework { +public: + explicit IManagerForSystemService(Common::UUID user_id) + : ServiceFramework("IManagerForSystemService") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "CheckAvailability"}, + {1, nullptr, "GetAccountId"}, + {2, nullptr, "EnsureIdTokenCacheAsync"}, + {3, nullptr, "LoadIdTokenCache"}, + {100, nullptr, "SetSystemProgramIdentification"}, + {101, nullptr, "RefreshNotificationTokenAsync"}, // 7.0.0+ + {110, nullptr, "GetServiceEntryRequirementCache"}, // 4.0.0+ + {111, nullptr, "InvalidateServiceEntryRequirementCache"}, // 4.0.0+ + {112, nullptr, "InvalidateTokenCache"}, // 4.0.0 - 6.2.0 + {113, nullptr, "GetServiceEntryRequirementCacheForOnlinePlay"}, // 6.1.0+ + {120, nullptr, "GetNintendoAccountId"}, + {121, nullptr, "CalculateNintendoAccountAuthenticationFingerprint"}, // 9.0.0+ + {130, nullptr, "GetNintendoAccountUserResourceCache"}, + {131, nullptr, "RefreshNintendoAccountUserResourceCacheAsync"}, + {132, nullptr, "RefreshNintendoAccountUserResourceCacheAsyncIfSecondsElapsed"}, + {133, nullptr, "GetNintendoAccountVerificationUrlCache"}, // 9.0.0+ + {134, nullptr, "RefreshNintendoAccountVerificationUrlCache"}, // 9.0.0+ + {135, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsyncIfSecondsElapsed"}, // 9.0.0+ + {140, nullptr, "GetNetworkServiceLicenseCache"}, // 5.0.0+ + {141, nullptr, "RefreshNetworkServiceLicenseCacheAsync"}, // 5.0.0+ + {142, nullptr, "RefreshNetworkServiceLicenseCacheAsyncIfSecondsElapsed"}, // 5.0.0+ + {150, nullptr, "CreateAuthorizationRequest"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +// 3.0.0+ +class IFloatingRegistrationRequest final : public ServiceFramework { +public: + explicit IFloatingRegistrationRequest(Common::UUID user_id) + : ServiceFramework("IFloatingRegistrationRequest") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "GetSessionId"}, + {12, nullptr, "GetAccountId"}, + {13, nullptr, "GetLinkedNintendoAccountId"}, + {14, nullptr, "GetNickname"}, + {15, nullptr, "GetProfileImage"}, + {21, nullptr, "LoadIdTokenCache"}, + {100, nullptr, "RegisterUser"}, // [1.0.0-3.0.2] RegisterAsync + {101, nullptr, "RegisterUserWithUid"}, // [1.0.0-3.0.2] RegisterWithUidAsync + {102, nullptr, "RegisterNetworkServiceAccountAsync"}, // 4.0.0+ + {103, nullptr, "RegisterNetworkServiceAccountWithUidAsync"}, // 4.0.0+ + {110, nullptr, "SetSystemProgramIdentification"}, + {111, nullptr, "EnsureIdTokenCacheAsync"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class IAdministrator final : public ServiceFramework { +public: + explicit IAdministrator(Common::UUID user_id) : ServiceFramework("IAdministrator") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "CheckAvailability"}, + {1, nullptr, "GetAccountId"}, + {2, nullptr, "EnsureIdTokenCacheAsync"}, + {3, nullptr, "LoadIdTokenCache"}, + {100, nullptr, "SetSystemProgramIdentification"}, + {101, nullptr, "RefreshNotificationTokenAsync"}, // 7.0.0+ + {110, nullptr, "GetServiceEntryRequirementCache"}, // 4.0.0+ + {111, nullptr, "InvalidateServiceEntryRequirementCache"}, // 4.0.0+ + {112, nullptr, "InvalidateTokenCache"}, // 4.0.0 - 6.2.0 + {113, nullptr, 
"GetServiceEntryRequirementCacheForOnlinePlay"}, // 6.1.0+ + {120, nullptr, "GetNintendoAccountId"}, + {121, nullptr, "CalculateNintendoAccountAuthenticationFingerprint"}, // 9.0.0+ + {130, nullptr, "GetNintendoAccountUserResourceCache"}, + {131, nullptr, "RefreshNintendoAccountUserResourceCacheAsync"}, + {132, nullptr, "RefreshNintendoAccountUserResourceCacheAsyncIfSecondsElapsed"}, + {133, nullptr, "GetNintendoAccountVerificationUrlCache"}, // 9.0.0+ + {134, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsync"}, // 9.0.0+ + {135, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsyncIfSecondsElapsed"}, // 9.0.0+ + {140, nullptr, "GetNetworkServiceLicenseCache"}, // 5.0.0+ + {141, nullptr, "RefreshNetworkServiceLicenseCacheAsync"}, // 5.0.0+ + {142, nullptr, "RefreshNetworkServiceLicenseCacheAsyncIfSecondsElapsed"}, // 5.0.0+ + {150, nullptr, "CreateAuthorizationRequest"}, + {200, nullptr, "IsRegistered"}, + {201, nullptr, "RegisterAsync"}, + {202, nullptr, "UnregisterAsync"}, + {203, nullptr, "DeleteRegistrationInfoLocally"}, + {220, nullptr, "SynchronizeProfileAsync"}, + {221, nullptr, "UploadProfileAsync"}, + {222, nullptr, "SynchronizaProfileAsyncIfSecondsElapsed"}, + {250, nullptr, "IsLinkedWithNintendoAccount"}, + {251, nullptr, "CreateProcedureToLinkWithNintendoAccount"}, + {252, nullptr, "ResumeProcedureToLinkWithNintendoAccount"}, + {255, nullptr, "CreateProcedureToUpdateLinkageStateOfNintendoAccount"}, + {256, nullptr, "ResumeProcedureToUpdateLinkageStateOfNintendoAccount"}, + {260, nullptr, "CreateProcedureToLinkNnidWithNintendoAccount"}, // 3.0.0+ + {261, nullptr, "ResumeProcedureToLinkNnidWithNintendoAccount"}, // 3.0.0+ + {280, nullptr, "ProxyProcedureToAcquireApplicationAuthorizationForNintendoAccount"}, + {290, nullptr, "GetRequestForNintendoAccountUserResourceView"}, // 8.0.0+ + {300, nullptr, "TryRecoverNintendoAccountUserStateAsync"}, // 6.0.0+ + {400, nullptr, "IsServiceEntryRequirementCacheRefreshRequiredForOnlinePlay"}, // 6.1.0+ + {401, nullptr, "RefreshServiceEntryRequirementCacheForOnlinePlayAsync"}, // 6.1.0+ + {900, nullptr, "GetAuthenticationInfoForWin"}, // 9.0.0+ + {901, nullptr, "ImportAsyncForWin"}, // 9.0.0+ + {997, nullptr, "DebugUnlinkNintendoAccountAsync"}, + {998, nullptr, "DebugSetAvailabilityErrorDetail"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class IAuthorizationRequest final : public ServiceFramework { +public: + explicit IAuthorizationRequest(Common::UUID user_id) + : ServiceFramework("IAuthorizationRequest") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "GetSessionId"}, + {10, nullptr, "InvokeWithoutInteractionAsync"}, + {19, nullptr, "IsAuthorized"}, + {20, nullptr, "GetAuthorizationCode"}, + {21, nullptr, "GetIdToken"}, + {22, nullptr, "GetState"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class IOAuthProcedure final : public ServiceFramework { +public: + explicit IOAuthProcedure(Common::UUID user_id) : ServiceFramework("IOAuthProcedure") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "PrepareAsync"}, + {1, nullptr, "GetRequest"}, + {2, nullptr, "ApplyResponse"}, + {3, nullptr, "ApplyResponseAsync"}, + {10, nullptr, "Suspend"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +// 3.0.0+ +class IOAuthProcedureForExternalNsa final : public ServiceFramework { +public: + explicit IOAuthProcedureForExternalNsa(Common::UUID user_id) + : ServiceFramework("IOAuthProcedureForExternalNsa") { + 
// clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "PrepareAsync"}, + {1, nullptr, "GetRequest"}, + {2, nullptr, "ApplyResponse"}, + {3, nullptr, "ApplyResponseAsync"}, + {10, nullptr, "Suspend"}, + {100, nullptr, "GetAccountId"}, + {101, nullptr, "GetLinkedNintendoAccountId"}, + {102, nullptr, "GetNickname"}, + {103, nullptr, "GetProfileImage"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class IOAuthProcedureForNintendoAccountLinkage final + : public ServiceFramework { +public: + explicit IOAuthProcedureForNintendoAccountLinkage(Common::UUID user_id) + : ServiceFramework("IOAuthProcedureForNintendoAccountLinkage") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "PrepareAsync"}, + {1, nullptr, "GetRequest"}, + {2, nullptr, "ApplyResponse"}, + {3, nullptr, "ApplyResponseAsync"}, + {10, nullptr, "Suspend"}, + {100, nullptr, "GetRequestWithTheme"}, + {101, nullptr, "IsNetworkServiceAccountReplaced"}, + {199, nullptr, "GetUrlForIntroductionOfExtraMembership"}, // 2.0.0 - 5.1.0 + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class INotifier final : public ServiceFramework { +public: + explicit INotifier(Common::UUID user_id) : ServiceFramework("INotifier") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "GetSystemEvent"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + class IProfileCommon : public ServiceFramework { public: explicit IProfileCommon(const char* name, bool editor_commands, Common::UUID user_id, @@ -226,6 +438,54 @@ public: : IProfileCommon("IProfileEditor", true, user_id, profile_manager) {} }; +class IAsyncContext final : public ServiceFramework { +public: + explicit IAsyncContext(Common::UUID user_id) : ServiceFramework("IAsyncContext") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "GetSystemEvent"}, + {1, nullptr, "Cancel"}, + {2, nullptr, "HasDone"}, + {3, nullptr, "GetResult"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class ISessionObject final : public ServiceFramework { +public: + explicit ISessionObject(Common::UUID user_id) : ServiceFramework("ISessionObject") { + // clang-format off + static const FunctionInfo functions[] = { + {999, nullptr, "Dummy"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class IGuestLoginRequest final : public ServiceFramework { +public: + explicit IGuestLoginRequest(Common::UUID) : ServiceFramework("IGuestLoginRequest") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "GetSessionId"}, + {11, nullptr, "Unknown"}, // 1.0.0 - 2.3.0 (the name is blank on Switchbrew) + {12, nullptr, "GetAccountId"}, + {13, nullptr, "GetLinkedNintendoAccountId"}, + {14, nullptr, "GetNickname"}, + {15, nullptr, "GetProfileImage"}, + {21, nullptr, "LoadIdTokenCache"}, // 3.0.0+ + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + class IManagerForApplication final : public ServiceFramework { public: explicit IManagerForApplication(Common::UUID user_id) @@ -265,6 +525,87 @@ private: Common::UUID user_id; }; +// 6.0.0+ +class IAsyncNetworkServiceLicenseKindContext final + : public ServiceFramework { +public: + explicit IAsyncNetworkServiceLicenseKindContext(Common::UUID user_id) + : ServiceFramework("IAsyncNetworkServiceLicenseKindContext") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "GetSystemEvent"}, + {1, nullptr, "Cancel"}, 
+ {2, nullptr, "HasDone"}, + {3, nullptr, "GetResult"}, + {4, nullptr, "GetNetworkServiceLicenseKind"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +// 8.0.0+ +class IOAuthProcedureForUserRegistration final + : public ServiceFramework { +public: + explicit IOAuthProcedureForUserRegistration(Common::UUID user_id) + : ServiceFramework("IOAuthProcedureForUserRegistration") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "PrepareAsync"}, + {1, nullptr, "GetRequest"}, + {2, nullptr, "ApplyResponse"}, + {3, nullptr, "ApplyResponseAsync"}, + {10, nullptr, "Suspend"}, + {100, nullptr, "GetAccountId"}, + {101, nullptr, "GetLinkedNintendoAccountId"}, + {102, nullptr, "GetNickname"}, + {103, nullptr, "GetProfileImage"}, + {110, nullptr, "RegisterUserAsync"}, + {111, nullptr, "GetUid"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +class DAUTH_O final : public ServiceFramework { +public: + explicit DAUTH_O(Common::UUID) : ServiceFramework("dauth:o") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "EnsureAuthenticationTokenCacheAsync"}, // [5.0.0-5.1.0] GeneratePostData + {1, nullptr, "LoadAuthenticationTokenCache"}, // 6.0.0+ + {2, nullptr, "InvalidateAuthenticationTokenCache"}, // 6.0.0+ + {10, nullptr, "EnsureEdgeTokenCacheAsync"}, // 6.0.0+ + {11, nullptr, "LoadEdgeTokenCache"}, // 6.0.0+ + {12, nullptr, "InvalidateEdgeTokenCache"}, // 6.0.0+ + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + +// 6.0.0+ +class IAsyncResult final : public ServiceFramework { +public: + explicit IAsyncResult(Common::UUID user_id) : ServiceFramework("IAsyncResult") { + // clang-format off + static const FunctionInfo functions[] = { + {0, nullptr, "GetResult"}, + {1, nullptr, "Cancel"}, + {2, nullptr, "IsAvailable"}, + {3, nullptr, "GetSystemEvent"}, + }; + // clang-format on + + RegisterHandlers(functions); + } +}; + void Module::Interface::GetUserCount(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_ACC, "called"); IPC::ResponseBuilder rb{ctx, 3}; diff --git a/src/core/hle/service/acc/acc_aa.cpp b/src/core/hle/service/acc/acc_aa.cpp index 3bac6bcd1f..51f119b122 100644 --- a/src/core/hle/service/acc/acc_aa.cpp +++ b/src/core/hle/service/acc/acc_aa.cpp @@ -13,8 +13,8 @@ ACC_AA::ACC_AA(std::shared_ptr module, std::shared_ptr p {0, nullptr, "EnsureCacheAsync"}, {1, nullptr, "LoadCache"}, {2, nullptr, "GetDeviceAccountId"}, - {50, nullptr, "RegisterNotificationTokenAsync"}, - {51, nullptr, "UnregisterNotificationTokenAsync"}, + {50, nullptr, "RegisterNotificationTokenAsync"}, // 1.0.0 - 6.2.0 + {51, nullptr, "UnregisterNotificationTokenAsync"}, // 1.0.0 - 6.2.0 }; RegisterHandlers(functions); } diff --git a/src/core/hle/service/acc/acc_su.cpp b/src/core/hle/service/acc/acc_su.cpp index 2eefc6df56..85620bde38 100644 --- a/src/core/hle/service/acc/acc_su.cpp +++ b/src/core/hle/service/acc/acc_su.cpp @@ -17,28 +17,28 @@ ACC_SU::ACC_SU(std::shared_ptr module, std::shared_ptr p {3, &ACC_SU::ListOpenUsers, "ListOpenUsers"}, {4, &ACC_SU::GetLastOpenedUser, "GetLastOpenedUser"}, {5, &ACC_SU::GetProfile, "GetProfile"}, - {6, nullptr, "GetProfileDigest"}, + {6, nullptr, "GetProfileDigest"}, // 3.0.0+ {50, &ACC_SU::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"}, {51, &ACC_SU::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"}, - {60, nullptr, "ListOpenContextStoredUsers"}, - {99, nullptr, "DebugActivateOpenContextRetention"}, + {60, nullptr, 
"ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0 + {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+ {100, nullptr, "GetUserRegistrationNotifier"}, {101, nullptr, "GetUserStateChangeNotifier"}, {102, nullptr, "GetBaasAccountManagerForSystemService"}, {103, nullptr, "GetBaasUserAvailabilityChangeNotifier"}, {104, nullptr, "GetProfileUpdateNotifier"}, - {105, nullptr, "CheckNetworkServiceAvailabilityAsync"}, - {106, nullptr, "GetProfileSyncNotifier"}, + {105, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+ + {106, nullptr, "GetProfileSyncNotifier"}, // 9.0.0+ {110, nullptr, "StoreSaveDataThumbnail"}, {111, nullptr, "ClearSaveDataThumbnail"}, {112, nullptr, "LoadSaveDataThumbnail"}, - {113, nullptr, "GetSaveDataThumbnailExistence"}, - {120, nullptr, "ListOpenUsersInApplication"}, - {130, nullptr, "ActivateOpenContextRetention"}, - {140, &ACC_SU::ListQualifiedUsers, "ListQualifiedUsers"}, - {150, nullptr, "AuthenticateApplicationAsync"}, - {190, nullptr, "GetUserLastOpenedApplication"}, - {191, nullptr, "ActivateOpenContextHolder"}, + {113, nullptr, "GetSaveDataThumbnailExistence"}, // 5.0.0+ + {120, nullptr, "ListOpenUsersInApplication"}, // 10.0.0+ + {130, nullptr, "ActivateOpenContextRetention"}, // 6.0.0+ + {140, &ACC_SU::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+ + {150, nullptr, "AuthenticateApplicationAsync"}, // 10.0.0+ + {190, nullptr, "GetUserLastOpenedApplication"}, // 1.0.0 - 9.2.0 + {191, nullptr, "ActivateOpenContextHolder"}, // 7.0.0+ {200, nullptr, "BeginUserRegistration"}, {201, nullptr, "CompleteUserRegistration"}, {202, nullptr, "CancelUserRegistration"}, @@ -46,15 +46,15 @@ ACC_SU::ACC_SU(std::shared_ptr module, std::shared_ptr p {204, nullptr, "SetUserPosition"}, {205, &ACC_SU::GetProfileEditor, "GetProfileEditor"}, {206, nullptr, "CompleteUserRegistrationForcibly"}, - {210, nullptr, "CreateFloatingRegistrationRequest"}, - {211, nullptr, "CreateProcedureToRegisterUserWithNintendoAccount"}, - {212, nullptr, "ResumeProcedureToRegisterUserWithNintendoAccount"}, + {210, nullptr, "CreateFloatingRegistrationRequest"}, // 3.0.0+ + {211, nullptr, "CreateProcedureToRegisterUserWithNintendoAccount"}, // 8.0.0+ + {212, nullptr, "ResumeProcedureToRegisterUserWithNintendoAccount"}, // 8.0.0+ {230, nullptr, "AuthenticateServiceAsync"}, {250, nullptr, "GetBaasAccountAdministrator"}, {290, nullptr, "ProxyProcedureForGuestLoginWithNintendoAccount"}, - {291, nullptr, "ProxyProcedureForFloatingRegistrationWithNintendoAccount"}, + {291, nullptr, "ProxyProcedureForFloatingRegistrationWithNintendoAccount"}, // 3.0.0+ {299, nullptr, "SuspendBackgroundDaemon"}, - {997, nullptr, "DebugInvalidateTokenCacheForUser"}, + {997, nullptr, "DebugInvalidateTokenCacheForUser"}, // 3.0.0+ {998, nullptr, "DebugSetUserStateClose"}, {999, nullptr, "DebugSetUserStateOpen"}, }; diff --git a/src/core/hle/service/acc/acc_u0.cpp b/src/core/hle/service/acc/acc_u0.cpp index fb4e7e772b..49f6e20f1b 100644 --- a/src/core/hle/service/acc/acc_u0.cpp +++ b/src/core/hle/service/acc/acc_u0.cpp @@ -17,23 +17,23 @@ ACC_U0::ACC_U0(std::shared_ptr module, std::shared_ptr p {3, &ACC_U0::ListOpenUsers, "ListOpenUsers"}, {4, &ACC_U0::GetLastOpenedUser, "GetLastOpenedUser"}, {5, &ACC_U0::GetProfile, "GetProfile"}, - {6, nullptr, "GetProfileDigest"}, + {6, nullptr, "GetProfileDigest"}, // 3.0.0+ {50, &ACC_U0::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"}, {51, &ACC_U0::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"}, - {60, nullptr, 
"ListOpenContextStoredUsers"}, - {99, nullptr, "DebugActivateOpenContextRetention"}, + {60, nullptr, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0 + {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+ {100, &ACC_U0::InitializeApplicationInfo, "InitializeApplicationInfo"}, {101, &ACC_U0::GetBaasAccountManagerForApplication, "GetBaasAccountManagerForApplication"}, {102, nullptr, "AuthenticateApplicationAsync"}, - {103, nullptr, "CheckNetworkServiceAvailabilityAsync"}, + {103, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+ {110, nullptr, "StoreSaveDataThumbnail"}, {111, nullptr, "ClearSaveDataThumbnail"}, {120, nullptr, "CreateGuestLoginRequest"}, - {130, nullptr, "LoadOpenContext"}, - {131, nullptr, "ListOpenContextStoredUsers"}, - {140, &ACC_U0::InitializeApplicationInfoRestricted, "InitializeApplicationInfoRestricted"}, - {141, &ACC_U0::ListQualifiedUsers, "ListQualifiedUsers"}, - {150, &ACC_U0::IsUserAccountSwitchLocked, "IsUserAccountSwitchLocked"}, + {130, nullptr, "LoadOpenContext"}, // 5.0.0+ + {131, nullptr, "ListOpenContextStoredUsers"}, // 6.0.0+ + {140, &ACC_U0::InitializeApplicationInfoRestricted, "InitializeApplicationInfoRestricted"}, // 6.0.0+ + {141, &ACC_U0::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+ + {150, &ACC_U0::IsUserAccountSwitchLocked, "IsUserAccountSwitchLocked"}, // 6.0.0+ }; // clang-format on diff --git a/src/core/hle/service/acc/acc_u1.cpp b/src/core/hle/service/acc/acc_u1.cpp index 9f29cdc82c..f47004f84c 100644 --- a/src/core/hle/service/acc/acc_u1.cpp +++ b/src/core/hle/service/acc/acc_u1.cpp @@ -17,28 +17,29 @@ ACC_U1::ACC_U1(std::shared_ptr module, std::shared_ptr p {3, &ACC_U1::ListOpenUsers, "ListOpenUsers"}, {4, &ACC_U1::GetLastOpenedUser, "GetLastOpenedUser"}, {5, &ACC_U1::GetProfile, "GetProfile"}, - {6, nullptr, "GetProfileDigest"}, + {6, nullptr, "GetProfileDigest"}, // 3.0.0+ {50, &ACC_U1::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"}, {51, &ACC_U1::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"}, - {60, nullptr, "ListOpenContextStoredUsers"}, - {99, nullptr, "DebugActivateOpenContextRetention"}, + {60, nullptr, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0 + {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+ {100, nullptr, "GetUserRegistrationNotifier"}, {101, nullptr, "GetUserStateChangeNotifier"}, {102, nullptr, "GetBaasAccountManagerForSystemService"}, - {103, nullptr, "GetProfileUpdateNotifier"}, - {104, nullptr, "CheckNetworkServiceAvailabilityAsync"}, - {105, nullptr, "GetBaasUserAvailabilityChangeNotifier"}, - {106, nullptr, "GetProfileSyncNotifier"}, + {103, nullptr, "GetBaasUserAvailabilityChangeNotifier"}, + {104, nullptr, "GetProfileUpdateNotifier"}, + {105, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+ + {106, nullptr, "GetProfileSyncNotifier"}, // 9.0.0+ {110, nullptr, "StoreSaveDataThumbnail"}, {111, nullptr, "ClearSaveDataThumbnail"}, {112, nullptr, "LoadSaveDataThumbnail"}, - {113, nullptr, "GetSaveDataThumbnailExistence"}, - {130, nullptr, "ActivateOpenContextRetention"}, - {140, &ACC_U1::ListQualifiedUsers, "ListQualifiedUsers"}, - {150, nullptr, "AuthenticateApplicationAsync"}, - {190, nullptr, "GetUserLastOpenedApplication"}, - {191, nullptr, "ActivateOpenContextHolder"}, - {997, nullptr, "DebugInvalidateTokenCacheForUser"}, + {113, nullptr, "GetSaveDataThumbnailExistence"}, // 5.0.0+ + {120, nullptr, "ListOpenUsersInApplication"}, // 10.0.0+ + {130, nullptr, "ActivateOpenContextRetention"}, // 6.0.0+ + {140, 
&ACC_U1::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+ + {150, nullptr, "AuthenticateApplicationAsync"}, // 10.0.0+ + {190, nullptr, "GetUserLastOpenedApplication"}, // 1.0.0 - 9.2.0 + {191, nullptr, "ActivateOpenContextHolder"}, // 7.0.0+ + {997, nullptr, "DebugInvalidateTokenCacheForUser"}, // 3.0.0+ {998, nullptr, "DebugSetUserStateClose"}, {999, nullptr, "DebugSetUserStateOpen"}, }; From 45dac6bc5cce693cf7affdfaffff433be8acd67f Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 22 Jun 2020 22:53:31 -0400 Subject: [PATCH 130/145] lm: Silence no return value warning --- src/core/hle/service/lm/manager.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/hle/service/lm/manager.cpp b/src/core/hle/service/lm/manager.cpp index b67081b86f..3ee2374e72 100644 --- a/src/core/hle/service/lm/manager.cpp +++ b/src/core/hle/service/lm/manager.cpp @@ -86,7 +86,8 @@ std::string FormatField(Field type, const std::vector& data) { return Common::StringFromFixedZeroTerminatedBuffer( reinterpret_cast(data.data()), data.size()); default: - UNIMPLEMENTED(); + UNIMPLEMENTED_MSG("Unimplemented field type={}", type); + return ""; } } From b8798a995bd8aaefc0d21577c9d129e19b39fb15 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 22 Jun 2020 22:54:09 -0400 Subject: [PATCH 131/145] yuzu_tester: Silence type conversion warning --- src/yuzu_tester/service/yuzutest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/yuzu_tester/service/yuzutest.cpp b/src/yuzu_tester/service/yuzutest.cpp index 85d3f436b4..2d3f6e3a7d 100644 --- a/src/yuzu_tester/service/yuzutest.cpp +++ b/src/yuzu_tester/service/yuzutest.cpp @@ -53,7 +53,7 @@ private: IPC::ResponseBuilder rb{ctx, 3}; rb.Push(RESULT_SUCCESS); - rb.Push(write_size); + rb.Push(static_cast(write_size)); } void StartIndividual(Kernel::HLERequestContext& ctx) { From 6ce5f3120be6a65a798d3abc6fda0fe6171d0296 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 5 Jun 2020 01:42:19 +1000 Subject: [PATCH 132/145] Macro HLE support --- src/video_core/CMakeLists.txt | 2 + src/video_core/engines/maxwell_3d.cpp | 2 +- src/video_core/engines/maxwell_3d.h | 4 + src/video_core/macro/macro.cpp | 35 ++++++- src/video_core/macro/macro.h | 19 +++- src/video_core/macro/macro_hle.cpp | 108 +++++++++++++++++++++ src/video_core/macro/macro_hle.h | 43 ++++++++ src/video_core/macro/macro_interpreter.cpp | 3 +- src/video_core/macro/macro_jit_x64.cpp | 3 +- 9 files changed, 209 insertions(+), 10 deletions(-) create mode 100644 src/video_core/macro/macro_hle.cpp create mode 100644 src/video_core/macro/macro_hle.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 099bb446e1..2dc752aa9e 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -27,6 +27,8 @@ add_library(video_core STATIC engines/shader_type.h macro/macro.cpp macro/macro.h + macro/macro_hle.cpp + macro/macro_hle.h macro/macro_interpreter.cpp macro/macro_interpreter.h macro/macro_jit_x64.cpp diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index ea3c8a9635..c01436295c 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector& parameters) ((method - MacroRegistersStart) >> 1) % static_cast(macro_positions.size()); // Execute the current macro. 
- macro_engine->Execute(macro_positions[entry], parameters); + macro_engine->Execute(*this, macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index d5fe25065a..5926c4d2de 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1418,6 +1418,10 @@ public: return execute_on; } + VideoCore::RasterizerInterface& GetRasterizer() { + return rasterizer; + } + /// Notify a memory write has happened. void OnMemoryWrite() { dirty.flags |= dirty.on_write_stores; diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index 89077a2d85..c8aa2534a7 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -2,23 +2,37 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include "common/assert.h" #include "common/logging/log.h" #include "core/settings.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/macro/macro.h" +#include "video_core/macro/macro_hle.h" #include "video_core/macro/macro_interpreter.h" #include "video_core/macro/macro_jit_x64.h" namespace Tegra { +MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) + : hle_macros{std::make_unique(maxwell3d)} {} + +MacroEngine::~MacroEngine() {} + void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); } -void MacroEngine::Execute(u32 method, const std::vector& parameters) { +void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, + const std::vector& parameters) { auto compiled_macro = macro_cache.find(method); if (compiled_macro != macro_cache.end()) { - compiled_macro->second->Execute(parameters, method); + const auto& cache_info = compiled_macro->second; + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } } else { // Macro not compiled, check if it's uploaded and if so, compile it auto macro_code = uploaded_macro_code.find(method); @@ -26,8 +40,21 @@ void MacroEngine::Execute(u32 method, const std::vector& parameters) { UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); return; } - macro_cache[method] = Compile(macro_code->second); - macro_cache[method]->Execute(parameters, method); + auto& cache_info = macro_cache[method]; + cache_info.hash = boost::hash_value(macro_code->second); + cache_info.lle_program = Compile(macro_code->second); + + auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); + if (hle_program.has_value()) { + cache_info.has_hle_program = true; + cache_info.hle_program = std::move(hle_program.value()); + } + + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } } } diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index b76ed891f1..5fa8023af6 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -11,9 +11,11 @@ #include "common/common_types.h" namespace Tegra { + namespace Engines { class Maxwell3D; } + namespace Macro { constexpr std::size_t NUM_MACRO_REGISTERS = 8; enum class Operation : u32 { @@ -94,6 +96,8 @@ union MethodAddress { } // namespace Macro +class HLEMacro; + class CachedMacro { public: virtual ~CachedMacro() = default; @@ -107,20 +111,29 @@ public: class MacroEngine { public: - virtual 
~MacroEngine() = default; + MacroEngine(Engines::Maxwell3D& maxwell3d); + virtual ~MacroEngine(); // Store the uploaded macro code to compile them when they're called. void AddCode(u32 method, u32 data); // Compiles the macro if its not in the cache, and executes the compiled macro - void Execute(u32 method, const std::vector& parameters); + void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector& parameters); protected: virtual std::unique_ptr Compile(const std::vector& code) = 0; private: - std::unordered_map> macro_cache; + struct CacheInfo { + std::unique_ptr lle_program{}; + std::unique_ptr hle_program{}; + u64 hash{}; + bool has_hle_program{}; + }; + + std::unordered_map macro_cache; std::unordered_map> uploaded_macro_code; + std::unique_ptr hle_macros; }; std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d); diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp new file mode 100644 index 0000000000..51827c8226 --- /dev/null +++ b/src/video_core/macro/macro_hle.cpp @@ -0,0 +1,108 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include +#include +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/rasterizer_interface.h" + +namespace Tegra { + +// HLE'd functions +static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, + const std::vector& parameters) { + const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B); + + maxwell3d.regs.draw.topology.Assign( + static_cast(parameters[0] & + ~(0x3ffffff << 26))); + maxwell3d.regs.vb_base_instance = parameters[5]; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.regs.vb_element_base = parameters[3]; + maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.index_array.first = parameters[4]; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(true, true); + } + maxwell3d.regs.index_array.count = 0; + maxwell3d.mme_draw.instance_count = 0; +} + +static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, + const std::vector& parameters) { + const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + + maxwell3d.regs.vertex_buffer.first = parameters[3]; + maxwell3d.regs.vertex_buffer.count = parameters[1]; + maxwell3d.regs.vb_base_instance = parameters[4]; + maxwell3d.regs.draw.topology.Assign( + static_cast(parameters[0])); + maxwell3d.mme_draw.instance_count = count; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(false, true); + } + maxwell3d.regs.vertex_buffer.count = 0; + maxwell3d.mme_draw.instance_count = 0; +} + +static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, + const std::vector& parameters) { + const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + const u32 element_base = parameters[4]; + const u32 base_instance = parameters[5]; + maxwell3d.regs.index_array.first = parameters[3]; + maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base? 
+ maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.vb_element_base = element_base; + maxwell3d.regs.vb_base_instance = base_instance; + maxwell3d.regs.const_buffer.cb_pos = 0x640; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.regs.const_buffer.cb_data[0] = element_base; + maxwell3d.regs.const_buffer.cb_data[1] = base_instance; + maxwell3d.regs.draw.topology.Assign( + static_cast(parameters[0])); + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(true, true); + } + maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base? + maxwell3d.regs.index_array.count = 0; + maxwell3d.regs.vb_element_base = 0x0; + maxwell3d.regs.vb_base_instance = 0x0; + maxwell3d.regs.const_buffer.cb_pos = 0x640; + maxwell3d.regs.const_buffer.cb_data[0] = 0; + maxwell3d.regs.const_buffer.cb_data[1] = 0; + maxwell3d.mme_draw.instance_count = 0; +} + +static const std::unordered_map hle_funcs{ + {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, + {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, + {0x0217920100488FF7, &HLE_0217920100488FF7}, +}; + +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::~HLEMacro() = default; + +std::optional> HLEMacro::GetHLEProgram(u64 hash) const { + auto it = hle_funcs.find(hash); + if (it != hle_funcs.end()) { + return std::make_unique(maxwell3d, it->second); + } else { + return {}; + } +} + +HLEMacroImpl::~HLEMacroImpl() = default; + +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) + : maxwell3d(maxwell3d), func(func) {} + +void HLEMacroImpl::Execute(const std::vector& parameters, u32 method) { + func(maxwell3d, parameters); +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h new file mode 100644 index 0000000000..de7f43dc47 --- /dev/null +++ b/src/video_core/macro/macro_hle.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include +#include +#include +#include "common/common_types.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector& parameters); + +class HLEMacro { +public: + HLEMacro(Engines::Maxwell3D& maxwell3d); + ~HLEMacro(); + std::optional> GetHLEProgram(u64 hash) const; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class HLEMacroImpl : public CachedMacro { +public: + explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func); + ~HLEMacroImpl(); + + void Execute(const std::vector& parameters, u32 method) override; + +private: + Engines::Maxwell3D& maxwell3d; + HLEFunction func; +}; + +} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 5edff27aad..aa52564197 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -11,7 +11,8 @@ MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} std::unique_ptr MacroInterpreter::Compile(const std::vector& code) { return std::make_unique(maxwell3d, code); diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 30abb66e5d..07292702f8 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -28,7 +28,8 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ BRANCH_HOLDER, }); -MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} std::unique_ptr MacroJITx64::Compile(const std::vector& code) { return std::make_unique(maxwell3d, code); From 74b4334d510b58d96e8305bc3f5a7c8d05e842ba Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 5 Jun 2020 12:59:59 +1000 Subject: [PATCH 133/145] Fix constbuffer for 0217920100488FF7 --- src/video_core/macro/macro_hle.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 51827c8226..887f403100 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -59,10 +59,10 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, maxwell3d.regs.index_array.count = parameters[1]; maxwell3d.regs.vb_element_base = element_base; maxwell3d.regs.vb_base_instance = base_instance; - maxwell3d.regs.const_buffer.cb_pos = 0x640; maxwell3d.mme_draw.instance_count = instance_count; - maxwell3d.regs.const_buffer.cb_data[0] = element_base; - maxwell3d.regs.const_buffer.cb_data[1] = base_instance; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, element_base); + maxwell3d.CallMethodFromMME(0x8e5, base_instance); maxwell3d.regs.draw.topology.Assign( static_cast(parameters[0])); if (maxwell3d.ShouldExecute()) { @@ -72,10 +72,10 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, maxwell3d.regs.index_array.count = 0; maxwell3d.regs.vb_element_base = 0x0; maxwell3d.regs.vb_base_instance = 0x0; - maxwell3d.regs.const_buffer.cb_pos = 0x640; - 
maxwell3d.regs.const_buffer.cb_data[0] = 0; - maxwell3d.regs.const_buffer.cb_data[1] = 0; maxwell3d.mme_draw.instance_count = 0; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, 0x0); + maxwell3d.CallMethodFromMME(0x8e5, 0x0); } static const std::unordered_map hle_funcs{ From fabdf5d3850c078d173653f259845c26a2ce6e7d Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 5 Jun 2020 13:09:52 +1000 Subject: [PATCH 134/145] Addressed issues --- src/video_core/engines/maxwell_3d.h | 4 ++++ src/video_core/macro/macro.cpp | 2 +- src/video_core/macro/macro.h | 2 +- src/video_core/macro/macro_hle.cpp | 20 ++++++++++---------- src/video_core/macro/macro_hle.h | 2 +- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 5926c4d2de..ef1618990b 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1422,6 +1422,10 @@ public: return rasterizer; } + const VideoCore::RasterizerInterface& GetRasterizer() const { + return rasterizer; + } + /// Notify a memory write has happened. void OnMemoryWrite() { dirty.flags |= dirty.on_write_stores; diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index c8aa2534a7..ef7dad349e 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -17,7 +17,7 @@ namespace Tegra { MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) : hle_macros{std::make_unique(maxwell3d)} {} -MacroEngine::~MacroEngine() {} +MacroEngine::~MacroEngine() = default; void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index 5fa8023af6..4d00b84b02 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -111,7 +111,7 @@ public: class MacroEngine { public: - MacroEngine(Engines::Maxwell3D& maxwell3d); + explicit MacroEngine(Engines::Maxwell3D& maxwell3d); virtual ~MacroEngine(); // Store the uploaded macro code to compile them when they're called. diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 887f403100..1f1348df35 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -2,7 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
-#include +#include #include #include "video_core/engines/maxwell_3d.h" #include "video_core/macro/macro_hle.h" @@ -78,22 +78,22 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, maxwell3d.CallMethodFromMME(0x8e5, 0x0); } -static const std::unordered_map hle_funcs{ - {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, - {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, - {0x0217920100488FF7, &HLE_0217920100488FF7}, +static const std::array, 3> hle_funcs{ + std::make_pair(0x771BB18C62444DA0, &HLE_771BB18C62444DA0), + std::make_pair(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD), + std::make_pair(0x0217920100488FF7, &HLE_0217920100488FF7), }; HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} HLEMacro::~HLEMacro() = default; std::optional> HLEMacro::GetHLEProgram(u64 hash) const { - auto it = hle_funcs.find(hash); - if (it != hle_funcs.end()) { - return std::make_unique(maxwell3d, it->second); - } else { - return {}; + const auto it = std::find_if(hle_funcs.begin(), hle_funcs.end(), + [hash](auto& pair) { return pair.first == hash; }); + if (it == hle_funcs.end()) { + return std::nullopt; } + return std::make_unique(maxwell3d, it->second); } HLEMacroImpl::~HLEMacroImpl() = default; diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h index de7f43dc47..7cd492a8f3 100644 --- a/src/video_core/macro/macro_hle.h +++ b/src/video_core/macro/macro_hle.h @@ -20,7 +20,7 @@ using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector> GetHLEProgram(u64 hash) const; From 52340e94ac5a64572643f01a23316ad492a40f66 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Fri, 5 Jun 2020 14:00:00 +1000 Subject: [PATCH 135/145] clear mme draw mode We already draw, so we can clear it --- src/video_core/macro/macro_hle.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 1f1348df35..689533f6af 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -29,6 +29,7 @@ static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, } maxwell3d.regs.index_array.count = 0; maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; } static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, @@ -47,6 +48,7 @@ static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, } maxwell3d.regs.vertex_buffer.count = 0; maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; } static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, @@ -76,6 +78,7 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, maxwell3d.CallMethodFromMME(0x8e3, 0x640); maxwell3d.CallMethodFromMME(0x8e4, 0x0); maxwell3d.CallMethodFromMME(0x8e5, 0x0); + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; } static const std::array, 3> hle_funcs{ From f5e2aec4220ee2b72ec2986e0e60625897b2fd44 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Wed, 24 Jun 2020 12:18:33 +1000 Subject: [PATCH 136/145] addressed issues --- src/video_core/macro/macro_hle.cpp | 10 ++++++---- src/video_core/macro/macro_hle.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 689533f6af..410f99018a 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -10,6 +10,7 @@ namespace Tegra { +namespace { // 
HLE'd functions static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { @@ -80,19 +81,20 @@ static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, maxwell3d.CallMethodFromMME(0x8e5, 0x0); maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; } +} // namespace -static const std::array, 3> hle_funcs{ +constexpr std::array, 3> hle_funcs{{ std::make_pair(0x771BB18C62444DA0, &HLE_771BB18C62444DA0), std::make_pair(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD), std::make_pair(0x0217920100488FF7, &HLE_0217920100488FF7), -}; +}}; HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} HLEMacro::~HLEMacro() = default; std::optional> HLEMacro::GetHLEProgram(u64 hash) const { - const auto it = std::find_if(hle_funcs.begin(), hle_funcs.end(), - [hash](auto& pair) { return pair.first == hash; }); + const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(), + [hash](const auto& pair) { return pair.first == hash; }); if (it == hle_funcs.end()) { return std::nullopt; } diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h index 7cd492a8f3..37af875a0e 100644 --- a/src/video_core/macro/macro_hle.h +++ b/src/video_core/macro/macro_hle.h @@ -22,6 +22,7 @@ class HLEMacro { public: explicit HLEMacro(Engines::Maxwell3D& maxwell3d); ~HLEMacro(); + std::optional> GetHLEProgram(u64 hash) const; private: From da79ec9565f670bcf1f09fdf7d9ae0241d97a241 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 11 May 2020 16:18:53 -0300 Subject: [PATCH 137/145] gl_stream_buffer: Always use persistent memory maps yuzu no longer supports platforms without persistent maps. --- .../renderer_opengl/gl_stream_buffer.cpp | 40 ++++++------------- .../renderer_opengl/gl_stream_buffer.h | 4 +- 2 files changed, 14 insertions(+), 30 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 932a2f69ef..9cf0f6b46a 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -14,8 +14,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent, - bool use_persistent) +OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent) : buffer_size(size) { gl_buffer.Create(); @@ -29,23 +28,16 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p allocate_size *= 2; } - if (use_persistent) { - persistent = true; - coherent = prefer_coherent; - const GLbitfield flags = - GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); - glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); - mapped_ptr = static_cast(glMapNamedBufferRange( - gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT))); - } else { - glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW); - } + coherent = prefer_coherent; + const GLbitfield flags = + GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); + glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); + mapped_ptr = static_cast(glMapNamedBufferRange( + gl_buffer.handle, 0, buffer_size, flags | (coherent ? 
0 : GL_MAP_FLUSH_EXPLICIT_BIT))); } OGLStreamBuffer::~OGLStreamBuffer() { - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } + glUnmapNamedBuffer(gl_buffer.handle); gl_buffer.Release(); } @@ -63,16 +55,14 @@ std::tuple OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a buffer_pos = 0; invalidate = true; - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } + glUnmapNamedBuffer(gl_buffer.handle); } - if (invalidate || !persistent) { + if (invalidate) { MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | - (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | - (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT); + const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | + GL_MAP_INVALIDATE_BUFFER_BIT | + (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT); mapped_ptr = static_cast( glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); mapped_offset = buffer_pos; @@ -88,10 +78,6 @@ void OGLStreamBuffer::Unmap(GLsizeiptr size) { glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); } - if (!persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } - buffer_pos += size; } diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 866da35940..65c3da93f3 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -13,8 +13,7 @@ namespace OpenGL { class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false, - bool use_persistent = true); + explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false); ~OGLStreamBuffer(); /* @@ -41,7 +40,6 @@ private: OGLBuffer gl_buffer; bool coherent = false; - bool persistent = false; GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; From 00c66a728958c3b2804131ce5baf44880119e018 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 11 May 2020 16:21:08 -0300 Subject: [PATCH 138/145] gl_stream_buffer: Always use a non-coherent buffer --- .../renderer_opengl/gl_stream_buffer.cpp | 20 +++++++++---------- .../renderer_opengl/gl_stream_buffer.h | 4 +--- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 9cf0f6b46a..aeafcfbfef 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -4,6 +4,7 @@ #include #include + #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" @@ -14,8 +15,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent) - : buffer_size(size) { +OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) { gl_buffer.Create(); GLsizeiptr allocate_size = size; @@ -28,12 +28,10 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p allocate_size *= 2; } - coherent = prefer_coherent; - const GLbitfield flags = - GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 
GL_MAP_COHERENT_BIT : 0); + static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); - mapped_ptr = static_cast(glMapNamedBufferRange( - gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT))); + mapped_ptr = static_cast( + glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); } OGLStreamBuffer::~OGLStreamBuffer() { @@ -59,10 +57,10 @@ std::tuple OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a } if (invalidate) { + static const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | + GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_FLUSH_EXPLICIT_BIT; + MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | - GL_MAP_INVALIDATE_BUFFER_BIT | - (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT); mapped_ptr = static_cast( glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); mapped_offset = buffer_pos; @@ -74,7 +72,7 @@ std::tuple OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a void OGLStreamBuffer::Unmap(GLsizeiptr size) { ASSERT(size <= mapped_size); - if (!coherent && size > 0) { + if (size > 0) { glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); } diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 65c3da93f3..826c2e3616 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -13,7 +13,7 @@ namespace OpenGL { class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false); + explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage); ~OGLStreamBuffer(); /* @@ -39,8 +39,6 @@ public: private: OGLBuffer gl_buffer; - bool coherent = false; - GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; GLintptr mapped_offset = 0; From 73fb3a304b215abce3cfb1c0c5eb2b43740b65ed Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 18 Jun 2020 03:54:13 -0300 Subject: [PATCH 139/145] gl_device: Expose NV_vertex_buffer_unified_memory except on Turing Expose NV_vertex_buffer_unified_memory when the driver supports it. This commit adds a function the determine if a GL_RENDERER is a Turing GPU. This is required because on Turing GPUs Nvidia's driver crashes when the buffer is marked as resident or on DeleteBuffers. Without a synchronous debug output (single threaded driver), it's likely that the driver will crash in the first blocking call. 
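
For reference, a consumer of this flag would pick between bindless vertex pulling and a regular buffer bind. The sketch below is illustrative only and not part of this commit: the helper name and parameters are made up, and it assumes only the standard NV_vertex_buffer_unified_memory entry points (glBufferAddressRangeNV with the GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV selector), the HasVertexBufferUnifiedMemory() query added below, and a buffer that was previously made resident so its GPU address could be queried.

// Illustrative sketch, not part of this patch.
// Assumes glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV) was called once at
// initialization and that 'gpu_address' was queried with
// glGetNamedBufferParameterui64vNV from a buffer made resident with
// glMakeNamedBufferResidentNV.
void BindVertexBufferRange(const Device& device, GLuint index, GLuint buffer,
                           GLuint64EXT gpu_address, GLintptr offset, GLsizeiptr size,
                           GLsizei stride) {
    if (device.HasVertexBufferUnifiedMemory()) {
        // Keep the binding's stride but feed the attribute fetcher a raw GPU address
        // instead of a buffer name.
        glBindVertexBuffer(index, 0, 0, stride);
        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index,
                               gpu_address + offset, size);
    } else {
        // Fallback path used on drivers where the extension is hidden (e.g. Turing).
        glBindVertexBuffer(index, buffer, offset, stride);
    }
}
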
--- src/video_core/renderer_opengl/gl_device.cpp | 26 +++++++++++++++++++- src/video_core/renderer_opengl/gl_device.h | 5 ++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 1011c7738e..447a19595f 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -188,16 +188,32 @@ bool IsASTCSupported() { return true; } +/// @brief Returns true when a GL_RENDERER is a Turing GPU +/// @param renderer GL_RENDERER string +bool IsTuring(std::string_view renderer) { + static constexpr std::array TURING_GPUS = { + "GTX 1650", "GTX 1660", "RTX 2060", "RTX 2070", + "RTX 2080", "TITAN RTX", "Quadro RTX 3000", "Quadro RTX 4000", + "Quadro RTX 5000", "Quadro RTX 6000", "Quadro RTX 8000", "Tesla T4", + }; + return std::any_of(TURING_GPUS.begin(), TURING_GPUS.end(), + [renderer](std::string_view candidate) { + return renderer.find(candidate) != std::string_view::npos; + }); +} + } // Anonymous namespace Device::Device() : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast(glGetString(GL_VENDOR)); + const std::string_view renderer = reinterpret_cast(glGetString(GL_RENDERER)); const std::string_view version = reinterpret_cast(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; + const bool is_turing = is_nvidia && IsTuring(renderer); bool disable_fast_buffer_sub_data = false; if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { @@ -221,8 +237,16 @@ Device::Device() has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; + + // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive + // uniform buffers as "push constants" + has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; + + // Nvidia's driver on Turing GPUs randomly crashes when the buffer is made resident, or on + // DeleteBuffers. Disable unified memory on these devices. 
+ has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory && !is_turing; + use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index c86e709b1f..e1d8119666 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -72,6 +72,10 @@ public: return has_texture_shadow_lod; } + bool HasVertexBufferUnifiedMemory() const { + return has_vertex_buffer_unified_memory; + } + bool HasASTC() const { return has_astc; } @@ -115,6 +119,7 @@ private: bool has_vertex_viewport_layer{}; bool has_image_load_formatted{}; bool has_texture_shadow_lod{}; + bool has_vertex_buffer_unified_memory{}; bool has_astc{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; From 32485917ba7cb7b2f0cad766c0897365294650a7 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Mon, 11 May 2020 16:35:04 -0300 Subject: [PATCH 140/145] gl_buffer_cache: Mark buffers as resident Make stream buffer and cached buffers as resident and query their address. This allows us to use GPU addresses for several proprietary Nvidia extensions. --- src/video_core/buffer_cache/buffer_cache.h | 21 +++++---- .../renderer_opengl/gl_buffer_cache.cpp | 24 ++++++---- .../renderer_opengl/gl_buffer_cache.h | 20 ++++++--- .../renderer_opengl/gl_rasterizer.cpp | 44 +++++++++---------- .../renderer_opengl/gl_stream_buffer.cpp | 11 ++++- .../renderer_opengl/gl_stream_buffer.h | 11 ++++- .../renderer_vulkan/vk_buffer_cache.cpp | 4 +- .../renderer_vulkan/vk_buffer_cache.h | 6 ++- .../renderer_vulkan/vk_rasterizer.cpp | 31 +++++++------ .../renderer_vulkan/vk_stream_buffer.h | 6 ++- 10 files changed, 111 insertions(+), 67 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index bae1d527cf..6ea59253ab 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -41,7 +41,11 @@ class BufferCache { static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; public: - using BufferInfo = std::pair; + struct BufferInfo { + BufferType handle; + u64 offset; + u64 address; + }; BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, bool is_written = false, bool use_fast_cbuf = false) { @@ -50,7 +54,7 @@ public: auto& memory_manager = system.GPU().MemoryManager(); const std::optional cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); if (!cpu_addr_opt) { - return {GetEmptyBuffer(size), 0}; + return GetEmptyBuffer(size); } const VAddr cpu_addr = *cpu_addr_opt; @@ -88,7 +92,7 @@ public: Buffer* const block = GetBlock(cpu_addr, size); MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size); if (!map) { - return {GetEmptyBuffer(size), 0}; + return GetEmptyBuffer(size); } if (is_written) { map->MarkAsModified(true, GetModifiedTicks()); @@ -101,7 +105,7 @@ public: } } - return {block->Handle(), static_cast(block->Offset(cpu_addr))}; + return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()}; } /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 
@@ -254,13 +258,12 @@ public: committed_flushes.pop_front(); } - virtual BufferType GetEmptyBuffer(std::size_t size) = 0; + virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; protected: explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - std::unique_ptr stream_buffer_) - : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)}, - stream_buffer_handle{stream_buffer->Handle()} {} + std::unique_ptr stream_buffer) + : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {} ~BufferCache() = default; @@ -449,7 +452,7 @@ private: buffer_ptr += size; buffer_offset += size; - return {stream_buffer_handle, uploaded_offset}; + return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()}; } void AlignBuffer(std::size_t alignment) { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index ad0577a4f4..e09b47f578 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -22,21 +22,28 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) + : VideoCommon::BufferBlock{cpu_addr, size} { gl_buffer.Create(); glNamedBufferData(gl_buffer.handle, static_cast(size), nullptr, GL_DYNAMIC_DRAW); + if (device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); + } } Buffer::~Buffer() = default; OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - const Device& device, std::size_t stream_size) - : GenericBufferCache{rasterizer, system, std::make_unique(stream_size, true)} { + const Device& device_, std::size_t stream_size) + : GenericBufferCache{rasterizer, system, + std::make_unique(device_, stream_size, true)}, + device{device_} { if (!device.HasFastBufferSubData()) { return; } - static constexpr auto size = static_cast(Maxwell::MaxConstBufferSize); + static constexpr GLsizeiptr size = static_cast(Maxwell::MaxConstBufferSize); glCreateBuffers(static_cast(std::size(cbufs)), std::data(cbufs)); for (const GLuint cbuf : cbufs) { glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); @@ -48,11 +55,11 @@ OGLBufferCache::~OGLBufferCache() { } std::shared_ptr OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared(cpu_addr, size); + return std::make_shared(device, cpu_addr, size); } -GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) { - return 0; +OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { + return {0, 0, 0}; } void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, @@ -79,8 +86,9 @@ OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_poi std::size_t size) { DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); const GLuint cbuf = cbufs[cbuf_cursor++]; + glNamedBufferSubData(cbuf, 0, static_cast(size), raw_pointer); - return {cbuf, 0}; + return {cbuf, 0, 0}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a49aaf9c4f..6462cfae58 
100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -25,15 +25,20 @@ class RasterizerOpenGL; class Buffer : public VideoCommon::BufferBlock { public: - explicit Buffer(VAddr cpu_addr, const std::size_t size); + explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); ~Buffer(); - GLuint Handle() const { + GLuint Handle() const noexcept { return gl_buffer.handle; } + u64 Address() const noexcept { + return gpu_address; + } + private: OGLBuffer gl_buffer; + u64 gpu_address = 0; }; using GenericBufferCache = VideoCommon::BufferCache; @@ -43,7 +48,7 @@ public: const Device& device, std::size_t stream_size); ~OGLBufferCache(); - GLuint GetEmptyBuffer(std::size_t) override; + BufferInfo GetEmptyBuffer(std::size_t) override; void Acquire() noexcept { cbuf_cursor = 0; @@ -64,10 +69,13 @@ protected: BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; private: + static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + + const Device& device; + std::size_t cbuf_cursor = 0; - std::array - cbufs; + std::array cbufs{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 2d6c11320b..7cb378a716 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -253,8 +253,8 @@ void RasterizerOpenGL::SetupVertexBuffer() { glBindVertexBuffer(static_cast(index), 0, 0, vertex_array.stride); continue; } - const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); - glBindVertexBuffer(static_cast(index), vertex_buffer, vertex_buffer_offset, + const auto info = buffer_cache.UploadMemory(start, size); + glBindVertexBuffer(static_cast(index), info.handle, info.offset, vertex_array.stride); } } @@ -285,9 +285,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { MICROPROFILE_SCOPE(OpenGL_Index); const auto& regs = system.GPU().Maxwell3D().regs; const std::size_t size = CalculateIndexBufferSize(); - const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer); - return offset; + const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); + return info.offset; } void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { @@ -643,9 +643,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { if (!device.UseAssemblyShaders()) { MaxwellUniformData ubo; ubo.SetFromRegs(gpu); - const auto [buffer, offset] = + const auto info = buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, + glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, static_cast(sizeof(ubo))); } @@ -956,8 +956,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, if (device.UseAssemblyShaders()) { glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, - buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); + glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); } return; } @@ -970,24 +969,25 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, 
const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); const GPUVAddr gpu_addr = buffer.address; - auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); if (device.UseAssemblyShaders()) { UNIMPLEMENTED_IF(use_unified); - if (offset != 0) { + if (info.offset != 0) { const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; - glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); - cbuf = staging_cbuf; - offset = 0; + glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); + info.handle = staging_cbuf; + info.offset = 0; } - glBindBufferRangeNV(stage, binding, cbuf, offset, size); + glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); return; } if (use_unified) { - glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size); + glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, + unified_offset, size); } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); + glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); } } @@ -1023,9 +1023,8 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size) { const auto alignment{device.GetShaderStorageBufferAlignment()}; - const auto [ssbo, buffer_offset] = - buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset, + const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, static_cast(size)); } @@ -1712,8 +1711,9 @@ void RasterizerOpenGL::EndTransformFeedback() { const GLuint handle = transform_feedback_buffers[index].handle; const GPUVAddr gpu_addr = binding.Address(); const std::size_t size = binding.buffer_size; - const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast(size)); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, + static_cast(size)); } } diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index aeafcfbfef..164df4feba 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -2,12 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
-#include +#include #include #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", @@ -15,7 +16,8 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) { +OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage) + : buffer_size(size) { gl_buffer.Create(); GLsizeiptr allocate_size = size; @@ -32,6 +34,11 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage) : buff glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); mapped_ptr = static_cast( glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); + + if (device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); + } } OGLStreamBuffer::~OGLStreamBuffer() { diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 826c2e3616..e67a82980f 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -11,9 +11,11 @@ namespace OpenGL { +class Device; + class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage); + explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage); ~OGLStreamBuffer(); /* @@ -32,13 +34,18 @@ public: return gl_buffer.handle; } - GLsizeiptr Size() const { + u64 Address() const { + return gpu_address; + } + + GLsizeiptr Size() const noexcept { return buffer_size; } private: OGLBuffer gl_buffer; + GLuint64EXT gpu_address = 0; GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; GLintptr mapped_offset = 0; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 1fde383280..df258d7a49 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -71,14 +71,14 @@ std::shared_ptr VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t s return std::make_shared(device, memory_manager, cpu_addr, size); } -VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) { +VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { size = std::max(size, std::size_t(4)); const auto& empty = staging_pool.GetUnusedBuffer(size, false); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { cmdbuf.FillBuffer(buffer, 0, size, 0); }); - return *empty.handle; + return {*empty.handle, 0, 0}; } void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 9ebbef8351..682383ff24 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -33,6 +33,10 @@ public: return *buffer.handle; } + u64 Address() const { + return 0; + } + private: VKBuffer buffer; }; @@ -44,7 +48,7 @@ public: VKScheduler& scheduler, 
VKStagingBufferPool& staging_pool); ~VKBufferCache(); - VkBuffer GetEmptyBuffer(std::size_t size) override; + BufferInfo GetEmptyBuffer(std::size_t size) override; protected: std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) override; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 29001953c8..e3714ee6db 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -870,10 +870,10 @@ void RasterizerVulkan::BeginTransformFeedback() { UNIMPLEMENTED_IF(binding.buffer_offset != 0); const GPUVAddr gpu_addr = binding.Address(); - const auto size = static_cast(binding.buffer_size); - const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + const VkDeviceSize size = static_cast(binding.buffer_size); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) { + scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) { cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size); cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); @@ -925,8 +925,8 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex buffer_bindings.AddVertexBinding(DefaultBuffer(), 0); continue; } - const auto [buffer, offset] = buffer_cache.UploadMemory(start, size); - buffer_bindings.AddVertexBinding(buffer, offset); + const auto info = buffer_cache.UploadMemory(start, size); + buffer_bindings.AddVertexBinding(info.handle, info.offset); } } @@ -948,7 +948,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar break; } const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; std::tie(buffer, offset) = quad_indexed_pass.Assemble( regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset); @@ -962,7 +964,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar break; } const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; auto format = regs.index_array.format; const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; @@ -1109,10 +1113,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); ASSERT(size <= MaxConstbufferSize); - const auto [buffer_handle, offset] = + const auto info = buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); - - update_descriptor_queue.AddBuffer(buffer_handle, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { @@ -1126,14 +1129,14 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the // default buffer. 
static constexpr std::size_t dummy_size = 4; - const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size); - update_descriptor_queue.AddBuffer(buffer, 0, dummy_size); + const auto info = buffer_cache.GetEmptyBuffer(dummy_size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size); return; } - const auto [buffer, offset] = buffer_cache.UploadMemory( + const auto info = buffer_cache.UploadMemory( actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); - update_descriptor_queue.AddBuffer(buffer, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index c765c60a06..689f0d276e 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -35,10 +35,14 @@ public: /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. void Unmap(u64 size); - VkBuffer Handle() const { + VkBuffer Handle() const noexcept { return *buffer; } + u64 Address() const noexcept { + return 0; + } + private: struct Watch final { VKFenceWatch fence; From 41a4090320ee52e914e8b4c789dfe14210794fed Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 18 Jun 2020 03:56:31 -0300 Subject: [PATCH 141/145] gl_rasterizer: Use NV_vertex_buffer_unified_memory for vertex buffer robustness Switch games are allowed to bind less data than they actually read from a vertex buffer; the expected behavior is that the missing values are read as zero. At the time of writing, only D3D12, OpenGL through NV_vertex_buffer_unified_memory, and NVN support vertex buffers with a size limit. In theory this could be emulated on Vulkan by creating a new VkBuffer for each (handle, offset, length) tuple and binding the expected data to it. That is likely to be slow and memory expensive when used on vertex buffers, and it would have to be done on every draw because, without analyzing indices, we cannot know when a game is going to read vertex data out of bounds. This is not a problem with OpenGL's BufferAddressRangeNV because it takes a length parameter, unlike Vulkan's CmdBindVertexBuffers, which only takes buffers and offsets (the length is implicit in the VkBuffer). It is not a problem on D3D12 either, because the D3D12_VERTEX_BUFFER_VIEW passed to IASetVertexBuffers takes SizeInBytes as a parameter (although I am not familiar with robustness on D3D12). Currently this only implements buffer ranges for vertex buffers, although index buffers can also be affected. A KHR_robustness profile is not created, but Nvidia's driver reads out-of-bounds vertex data as zero anyway; this might have to be changed in the future. - Fixes SMO random triangles when capturing an enemy, getting hit, or looking at the environment on certain maps.
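As a rough illustration of the binding pattern described above (not code from this patch; buffer, binding_index, offset, size and stride are placeholder names, while the GL entry points are the same ones the diff below uses), binding a bounded vertex range with NV_vertex_buffer_unified_memory looks roughly like this:

    // One-time setup: switch vertex fetches to GPU-address based pulls.
    glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);

    // Make the buffer resident and query its GPU virtual address.
    GLuint64EXT gpu_address = 0;
    glMakeNamedBufferResidentNV(buffer, GL_READ_ONLY);
    glGetNamedBufferParameterui64vNV(buffer, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);

    // Per draw: keep glBindVertexBuffer only for the stride, then source the
    // attribute data from an explicit (address, length) range. Reads past the
    // bound length come back as zero on Nvidia's driver, which is what the
    // robustness behavior above relies on.
    glBindVertexBuffer(binding_index, 0, 0, stride);
    glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, binding_index,
                           gpu_address + offset, size);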
--- .../renderer_opengl/gl_rasterizer.cpp | 28 +++++++++++++------ .../renderer_opengl/renderer_opengl.cpp | 17 ++++++++++- .../renderer_opengl/renderer_opengl.h | 3 ++ 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 7cb378a716..362457ffe2 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -61,7 +61,8 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; -constexpr std::size_t NumSupportedVertexAttributes = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; template Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, @@ -193,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() { // avoid OpenGL errors. // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't // assume every shader uses them all. - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexFormat0 + index]) { continue; } @@ -231,9 +232,11 @@ void RasterizerOpenGL::SetupVertexBuffer() { MICROPROFILE_SCOPE(OpenGL_VB); + const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); + // Upload all guest vertex arrays sequentially to our buffer const auto& regs = gpu.regs; - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { if (!flags[Dirty::VertexBuffer0 + index]) { continue; } @@ -246,16 +249,25 @@ void RasterizerOpenGL::SetupVertexBuffer() { const GPUVAddr start = vertex_array.StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end >= start); + + const GLuint gl_index = static_cast(index); const u64 size = end - start; if (size == 0) { - glBindVertexBuffer(static_cast(index), 0, 0, vertex_array.stride); + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + if (use_unified_memory) { + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); + } continue; } const auto info = buffer_cache.UploadMemory(start, size); - glBindVertexBuffer(static_cast(index), info.handle, info.offset, - vertex_array.stride); + if (use_unified_memory) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, + info.address + info.offset, size); + } else { + glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); + } } } @@ -268,7 +280,7 @@ void RasterizerOpenGL::SetupVertexInstances() { flags[Dirty::VertexInstances] = false; const auto& regs = gpu.regs; - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexInstance0 + index]) { continue; } diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 6214fcbc3d..c40adb6e77 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() { // Clear screen to black 
LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + + // Enable unified vertex attributes and query vertex buffer address when the driver supports it + if (device.HasVertexBufferUnifiedMemory()) { + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + + glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, + &vertex_buffer_address); + } } void RendererOpenGL::AddTelemetryFields() { @@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { offsetof(ScreenRectVertex, tex_coord)); glVertexAttribBinding(PositionLocation, 0); glVertexAttribBinding(TexCoordLocation, 0); - glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + if (device.HasVertexBufferUnifiedMemory()) { + glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address, + sizeof(vertices)); + } else { + glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + } glBindTextureUnit(0, screen_info.display_texture); glBindSampler(0, 0); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 61bf507f41..8b18d32e62 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -107,6 +107,9 @@ private: OGLPipeline pipeline; OGLFramebuffer screenshot_framebuffer; + // GPU address of the vertex buffer + GLuint64EXT vertex_buffer_address = 0; + /// Display information for Switch screen ScreenInfo screen_info; From 39c97f1b652898dbd0e5e6d028de2ba4b9fa94a0 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 18 Jun 2020 21:53:47 -0300 Subject: [PATCH 142/145] gl_stream_buffer: Use InvalidateBufferData instead of unmap and map Making the stream buffer resident increases GPU usage significantly in some games. This seems to be addressed by invalidating the stream buffer with InvalidateBufferData instead of using an Unmap + Map (with invalidation flags).
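A condensed sketch of the resulting Map() wrap-around path, using the same member names as gl_stream_buffer.cpp in the diff below (this is a simplified restatement of the change, not additional code from the patch):

    if (buffer_pos + size > buffer_size) {
        // Orphan the old storage; the persistent mapping stays valid, so no
        // glUnmapNamedBuffer / glMapNamedBufferRange round trip is needed.
        glInvalidateBufferData(gl_buffer.handle);
        buffer_pos = 0;
        invalidate = true;
    }
    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);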
--- .../renderer_opengl/gl_stream_buffer.cpp | 19 +++++-------------- .../renderer_opengl/gl_stream_buffer.h | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 164df4feba..3655ff6292 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -57,30 +57,21 @@ std::tuple OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a bool invalidate = false; if (buffer_pos + size > buffer_size) { + MICROPROFILE_SCOPE(OpenGL_StreamBuffer); + glInvalidateBufferData(gl_buffer.handle); + buffer_pos = 0; invalidate = true; - - glUnmapNamedBuffer(gl_buffer.handle); } - if (invalidate) { - static const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | - GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_FLUSH_EXPLICIT_BIT; - - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - mapped_ptr = static_cast( - glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); - mapped_offset = buffer_pos; - } - - return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate); + return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate); } void OGLStreamBuffer::Unmap(GLsizeiptr size) { ASSERT(size <= mapped_size); if (size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); + glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); } buffer_pos += size; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index e67a82980f..307a67113d 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -48,7 +48,6 @@ private: GLuint64EXT gpu_address = 0; GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; - GLintptr mapped_offset = 0; GLsizeiptr mapped_size = 0; u8* mapped_ptr = nullptr; }; From 32a2dcd4153f4e2aea7b5f88c85d8a352f647f12 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Fri, 19 Jun 2020 20:47:48 -0300 Subject: [PATCH 143/145] buffer_cache: Use buffer methods instead of cache virtual methods --- src/video_core/buffer_cache/buffer_cache.h | 23 ++--- .../renderer_opengl/gl_buffer_cache.cpp | 38 ++++---- .../renderer_opengl/gl_buffer_cache.h | 16 ++-- .../renderer_vulkan/vk_buffer_cache.cpp | 89 ++++++++++--------- .../renderer_vulkan/vk_buffer_cache.h | 23 ++--- 5 files changed, 90 insertions(+), 99 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 6ea59253ab..cf8bdd0212 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -269,15 +269,6 @@ protected: virtual std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) = 0; - virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) = 0; - - virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) = 0; - - virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) = 0; - virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { return {}; } @@ -339,11 +330,11 @@ private: const VAddr cpu_addr_end = cpu_addr + size; if (memory_manager.IsGranularRange(gpu_addr, size)) { u8* host_ptr = memory_manager.GetPointer(gpu_addr); - UploadBlockData(*block, 
block->Offset(cpu_addr), size, host_ptr); + block->Upload(block->Offset(cpu_addr), size, host_ptr); } else { staging_buffer.resize(size); memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data()); + block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); } return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); } @@ -402,7 +393,7 @@ private: } staging_buffer.resize(size); system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data()); + block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); } } @@ -439,7 +430,7 @@ private: const std::size_t size = map->end - map->start; staging_buffer.resize(size); - DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data()); + block->Download(block->Offset(map->start), size, staging_buffer.data()); system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size); map->MarkAsModified(false, 0); } @@ -467,7 +458,7 @@ private: const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; const VAddr cpu_addr = buffer->CpuAddr(); std::shared_ptr new_buffer = CreateBlock(cpu_addr, new_size); - CopyBlock(*buffer, *new_buffer, 0, 0, old_size); + new_buffer->CopyFrom(*buffer, 0, 0, old_size); QueueDestruction(std::move(buffer)); const VAddr cpu_addr_end = cpu_addr + new_size - 1; @@ -489,8 +480,8 @@ private: const std::size_t new_size = size_1 + size_2; std::shared_ptr new_buffer = CreateBlock(new_addr, new_size); - CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1); - CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2); + new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); + new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); QueueDestruction(std::move(first)); QueueDestruction(std::move(second)); diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index e09b47f578..d9f7b4cc60 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -34,6 +34,24 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) Buffer::~Buffer() = default; +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const { + glNamedBufferSubData(Handle(), static_cast(offset), static_cast(size), + data); +} + +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const { + MICROPROFILE_SCOPE(OpenGL_Buffer_Download); + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + glGetNamedBufferSubData(Handle(), static_cast(offset), static_cast(size), + data); +} + +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const { + glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast(src_offset), + static_cast(dst_offset), static_cast(size)); +} + OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, const Device& device_, std::size_t stream_size) : GenericBufferCache{rasterizer, system, @@ -62,26 +80,6 @@ OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { return {0, 0, 0}; } -void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { - glNamedBufferSubData(buffer.Handle(), static_cast(offset), - 
static_cast(size), data); -} - -void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); - glGetNamedBufferSubData(buffer.Handle(), static_cast(offset), - static_cast(size), data); -} - -void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { - glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast(src_offset), - static_cast(dst_offset), static_cast(size)); -} - OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, std::size_t size) { DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 6462cfae58..59d95adbc4 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -28,6 +28,13 @@ public: explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); ~Buffer(); + void Upload(std::size_t offset, std::size_t size, const u8* data) const; + + void Download(std::size_t offset, std::size_t size, u8* data) const; + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const; + GLuint Handle() const noexcept { return gl_buffer.handle; } @@ -57,15 +64,6 @@ public: protected: std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) override; - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; - BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; private: diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index df258d7a49..f10f96cd89 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -37,9 +37,9 @@ std::unique_ptr CreateStreamBuffer(const VKDevice& device, VKSch } // Anonymous namespace -Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, - std::size_t size) - : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_, + VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size) + : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} { VkBufferCreateInfo ci; ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; ci.pNext = nullptr; @@ -56,40 +56,15 @@ Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cp Buffer::~Buffer() = default; -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool) - : VideoCommon::BufferCache{rasterizer, system, - CreateStreamBuffer(device, - scheduler)}, - device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ - staging_pool} {} - -VKBufferCache::~VKBufferCache() = default; - -std::shared_ptr VKBufferCache::CreateBlock(VAddr cpu_addr, 
std::size_t size) { - return std::make_shared(device, memory_manager, cpu_addr, size); -} - -VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { - size = std::max(size, std::size_t(4)); - const auto& empty = staging_pool.GetUnusedBuffer(size, false); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, size, 0); - }); - return {*empty.handle, 0, 0}; -} - -void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const { const auto& staging = staging_pool.GetUnusedBuffer(size, true); std::memcpy(staging.commit->Map(size), data, size); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset, - size](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size}); + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { + cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size}); VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -98,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; + barrier.buffer = handle; barrier.offset = offset; barrier.size = size; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, @@ -106,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st }); } -void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const { const auto& staging = staging_pool.GetUnusedBuffer(size, true); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset, - size](vk::CommandBuffer cmdbuf) { + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = nullptr; @@ -119,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; + barrier.buffer = handle; barrier.offset = offset; barrier.size = size; @@ -127,17 +102,19 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {}); - cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size}); + cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size}); }); scheduler.Finish(); std::memcpy(data, staging.commit->Map(size), size); } -void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { +void Buffer::CopyFrom(const Buffer& 
src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const { scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset, + + const VkBuffer dst_buffer = Handle(); + scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset, size](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size}); @@ -165,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t }); } +VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, + const VKDevice& device, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool) + : VideoCommon::BufferCache{rasterizer, system, + CreateStreamBuffer(device, + scheduler)}, + device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ + staging_pool} {} + +VKBufferCache::~VKBufferCache() = default; + +std::shared_ptr VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared(device, memory_manager, scheduler, staging_pool, cpu_addr, + size); +} + +VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { + size = std::max(size, std::size_t(4)); + const auto& empty = staging_pool.GetUnusedBuffer(size, false); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, size, 0); + }); + return {*empty.handle, 0, 0}; +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 682383ff24..3630aca77f 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -25,10 +25,17 @@ class VKScheduler; class Buffer final : public VideoCommon::BufferBlock { public: - explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, - std::size_t size); + explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size); ~Buffer(); + void Upload(std::size_t offset, std::size_t size, const u8* data) const; + + void Download(std::size_t offset, std::size_t size, u8* data) const; + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const; + VkBuffer Handle() const { return *buffer.handle; } @@ -38,6 +45,9 @@ public: } private: + VKScheduler& scheduler; + VKStagingBufferPool& staging_pool; + VKBuffer buffer; }; @@ -53,15 +63,6 @@ public: protected: std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) override; - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; - private: const VKDevice& device; VKMemoryManager& memory_manager; From 2c9308954c0a7cc7e8f9b12508ff28f570ffe3e9 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Mon, 22 Jun 2020 05:37:46 -0400 Subject: [PATCH 144/145] hid: Stub a series of "SevenSixAxisSensor" service commands - Used by Captain Toad: 
Treasure Tracker Update 1.3.0 While we're at it, fix the input parameters for SetIsPalmaAllConnectable and SetPalmaBoostMode --- src/core/hle/service/hid/hid.cpp | 93 ++++++++++++++++++++++++++------ src/core/hle/service/hid/hid.h | 13 +++-- 2 files changed, 85 insertions(+), 21 deletions(-) diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index 72a050de25..1ee99bfeb5 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -230,15 +230,15 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) { {211, nullptr, "IsVibrationDeviceMounted"}, {300, &Hid::ActivateConsoleSixAxisSensor, "ActivateConsoleSixAxisSensor"}, {301, &Hid::StartConsoleSixAxisSensor, "StartConsoleSixAxisSensor"}, - {302, nullptr, "StopConsoleSixAxisSensor"}, - {303, nullptr, "ActivateSevenSixAxisSensor"}, - {304, nullptr, "StartSevenSixAxisSensor"}, + {302, &Hid::StopConsoleSixAxisSensor, "StopConsoleSixAxisSensor"}, + {303, &Hid::ActivateSevenSixAxisSensor, "ActivateSevenSixAxisSensor"}, + {304, &Hid::StartSevenSixAxisSensor, "StartSevenSixAxisSensor"}, {305, &Hid::StopSevenSixAxisSensor, "StopSevenSixAxisSensor"}, {306, &Hid::InitializeSevenSixAxisSensor, "InitializeSevenSixAxisSensor"}, - {307, nullptr, "FinalizeSevenSixAxisSensor"}, + {307, &Hid::FinalizeSevenSixAxisSensor, "FinalizeSevenSixAxisSensor"}, {308, nullptr, "SetSevenSixAxisSensorFusionStrength"}, {309, nullptr, "GetSevenSixAxisSensorFusionStrength"}, - {310, nullptr, "ResetSevenSixAxisSensorTimestamp"}, + {310, &Hid::ResetSevenSixAxisSensorTimestamp, "ResetSevenSixAxisSensorTimestamp"}, {400, nullptr, "IsUsbFullKeyControllerEnabled"}, {401, nullptr, "EnableUsbFullKeyController"}, {402, nullptr, "IsUsbFullKeyControllerConnected"}, @@ -374,6 +374,15 @@ void Hid::ActivateKeyboard(Kernel::HLERequestContext& ctx) { rb.Push(RESULT_SUCCESS); } +void Hid::SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto flags{rp.Pop()}; + LOG_WARNING(Service_HID, "(STUBBED) called. 
flags={}", flags); + + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_SUCCESS); +} + void Hid::ActivateGesture(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; const auto unknown{rp.Pop()}; @@ -413,6 +422,18 @@ void Hid::StartSixAxisSensor(Kernel::HLERequestContext& ctx) { rb.Push(RESULT_SUCCESS); } +void Hid::StopSixAxisSensor(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto handle{rp.Pop()}; + const auto applet_resource_user_id{rp.Pop()}; + + LOG_WARNING(Service_HID, "(STUBBED) called, handle={}, applet_resource_user_id={}", handle, + applet_resource_user_id); + + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_SUCCESS); +} + void Hid::SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; const auto handle{rp.Pop()}; @@ -832,33 +853,35 @@ void Hid::StartConsoleSixAxisSensor(Kernel::HLERequestContext& ctx) { rb.Push(RESULT_SUCCESS); } -void Hid::StopSixAxisSensor(Kernel::HLERequestContext& ctx) { +void Hid::StopConsoleSixAxisSensor(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; const auto handle{rp.Pop()}; + const auto applet_resource_user_id{rp.Pop()}; - LOG_WARNING(Service_HID, "(STUBBED) called, handle={}", handle); + LOG_WARNING(Service_HID, "(STUBBED) called, handle={}, applet_resource_user_id={}", handle, + applet_resource_user_id); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); } -void Hid::SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx) { +void Hid::ActivateSevenSixAxisSensor(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; const auto applet_resource_user_id{rp.Pop()}; - const auto unknown{rp.Pop()}; - LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}, unknown={}", - applet_resource_user_id, unknown); + LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}", + applet_resource_user_id); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); } -void Hid::SetPalmaBoostMode(Kernel::HLERequestContext& ctx) { +void Hid::StartSevenSixAxisSensor(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; - const auto unknown{rp.Pop()}; + const auto applet_resource_user_id{rp.Pop()}; - LOG_WARNING(Service_HID, "(STUBBED) called, unknown={}", unknown); + LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}", + applet_resource_user_id); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); @@ -882,10 +905,46 @@ void Hid::InitializeSevenSixAxisSensor(Kernel::HLERequestContext& ctx) { rb.Push(RESULT_SUCCESS); } -void Hid::SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx) { +void Hid::FinalizeSevenSixAxisSensor(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; - const auto flags{rp.Pop()}; - LOG_WARNING(Service_HID, "(STUBBED) called. 
flags={}", flags); + const auto applet_resource_user_id{rp.Pop()}; + + LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}", + applet_resource_user_id); + + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_SUCCESS); +} + +void Hid::ResetSevenSixAxisSensorTimestamp(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto applet_resource_user_id{rp.Pop()}; + + LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}", + applet_resource_user_id); + + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_SUCCESS); +} + +void Hid::SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto applet_resource_user_id{rp.Pop()}; + const auto is_palma_all_connectable{rp.Pop()}; + + LOG_WARNING(Service_HID, + "(STUBBED) called, applet_resource_user_id={}, is_palma_all_connectable={}", + applet_resource_user_id, is_palma_all_connectable); + + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_SUCCESS); +} + +void Hid::SetPalmaBoostMode(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto palma_boost_mode{rp.Pop()}; + + LOG_WARNING(Service_HID, "(STUBBED) called, palma_boost_mode={}", palma_boost_mode); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h index d481a75f85..ffce5ba458 100644 --- a/src/core/hle/service/hid/hid.h +++ b/src/core/hle/service/hid/hid.h @@ -91,9 +91,11 @@ private: void ActivateTouchScreen(Kernel::HLERequestContext& ctx); void ActivateMouse(Kernel::HLERequestContext& ctx); void ActivateKeyboard(Kernel::HLERequestContext& ctx); + void SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx); void ActivateGesture(Kernel::HLERequestContext& ctx); void ActivateNpadWithRevision(Kernel::HLERequestContext& ctx); void StartSixAxisSensor(Kernel::HLERequestContext& ctx); + void StopSixAxisSensor(Kernel::HLERequestContext& ctx); void SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx); void IsSixAxisSensorAtRest(Kernel::HLERequestContext& ctx); void SetSupportedNpadStyleSet(Kernel::HLERequestContext& ctx); @@ -126,12 +128,15 @@ private: void IsVibrationPermitted(Kernel::HLERequestContext& ctx); void ActivateConsoleSixAxisSensor(Kernel::HLERequestContext& ctx); void StartConsoleSixAxisSensor(Kernel::HLERequestContext& ctx); - void StopSixAxisSensor(Kernel::HLERequestContext& ctx); - void SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx); - void SetPalmaBoostMode(Kernel::HLERequestContext& ctx); + void StopConsoleSixAxisSensor(Kernel::HLERequestContext& ctx); + void ActivateSevenSixAxisSensor(Kernel::HLERequestContext& ctx); + void StartSevenSixAxisSensor(Kernel::HLERequestContext& ctx); void StopSevenSixAxisSensor(Kernel::HLERequestContext& ctx); void InitializeSevenSixAxisSensor(Kernel::HLERequestContext& ctx); - void SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx); + void FinalizeSevenSixAxisSensor(Kernel::HLERequestContext& ctx); + void ResetSevenSixAxisSensorTimestamp(Kernel::HLERequestContext& ctx); + void SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx); + void SetPalmaBoostMode(Kernel::HLERequestContext& ctx); std::shared_ptr applet_resource; Core::System& system; From a927d8be52c343bc1025e5df822c56470eb27919 Mon Sep 17 00:00:00 2001 From: David Marcec Date: Thu, 25 Jun 2020 19:12:56 +1000 Subject: [PATCH 145/145] gl_device: Fix IsASTCSupported Other targets were never actually checked --- src/video_core/renderer_opengl/gl_device.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 447a19595f..b6b6659c1d 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -178,7 +178,7 @@ bool IsASTCSupported() { for (const GLenum format : formats) { for (const GLenum support : required_support) { GLint value; - glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value); + glGetInternalformativ(target, format, support, 1, &value); if (value != GL_FULL_SUPPORT) { return false; }
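For context, a rough reconstruction of the loop this last hunk sits in, assuming an outer loop over a list of texture targets, which the hunk implies through the target variable but does not show; the point of the fix is that the query now asks about each target instead of always GL_TEXTURE_2D:

    for (const GLenum target : targets) {
        for (const GLenum format : formats) {
            for (const GLenum support : required_support) {
                GLint value;
                glGetInternalformativ(target, format, support, 1, &value);
                if (value != GL_FULL_SUPPORT) {
                    return false;
                }
            }
        }
    }
    return true;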