From 45d0e821dcdfd7c286d83b167c87440e7a420c46 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Thu, 14 Sep 2017 14:37:14 +0300 Subject: [PATCH] gl: Minor optimizations rsx: Texture cache - improvements to locking rsx: Minor optimizations to get_current_vertex_program and begin-end batch flushes rsx: Optimize texture cache storage - Manages storage in blocks of 16MB rsx/vk/gl: Fix swizzled texture input gl: Hotfix for compressed texture formats --- rpcs3/Emu/RSX/Common/texture_cache.h | 398 ++++++++++++++------------- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 11 +- rpcs3/Emu/RSX/GL/GLRenderTargets.cpp | 14 + rpcs3/Emu/RSX/GL/GLTexture.cpp | 32 ++- rpcs3/Emu/RSX/GL/GLTexture.h | 8 + rpcs3/Emu/RSX/GL/GLTextureCache.h | 45 ++- rpcs3/Emu/RSX/RSXThread.cpp | 112 ++++---- rpcs3/Emu/RSX/VK/VKTextureCache.h | 7 +- rpcs3/Emu/RSX/rsx_cache.h | 12 +- 9 files changed, 372 insertions(+), 267 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 97878ad4b6..ef4f544e6e 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -14,6 +14,12 @@ namespace rsx swapped_native_component_order = 2, }; + enum texture_upload_context + { + shader_read = 0, + blit_engine_src = 1 + }; + template class texture_cache { @@ -34,12 +40,14 @@ namespace rsx void notify(u32 data_size) { + verify(HERE), valid_count >= 0; max_range = std::max(data_size, max_range); valid_count++; } void add(section_storage_type& section, u32 data_size) { + verify(HERE), valid_count >= 0; max_range = std::max(data_size, max_range); valid_count++; @@ -65,7 +73,7 @@ namespace rsx std::unordered_map m_cache_miss_statistics_table; //Memory usage - const s32 m_max_zombie_objects = 32; //Limit on how many texture objects to keep around for reuse after they are invalidated + const s32 m_max_zombie_objects = 128; //Limit on how many texture objects to keep around for reuse after they are invalidated s32 m_unreleased_texture_objects = 0; //Number of invalidated objects not yet freed from memory /* Helpers */ @@ -74,11 +82,141 @@ namespace rsx virtual image_view_type create_temporary_subresource_view(commandbuffer_type&, image_storage_type* src, u32 gcm_format, u16 x, u16 y, u16 w, u16 h) = 0; virtual section_storage_type* create_new_texture(commandbuffer_type&, u32 rsx_address, u32 rsx_size, u16 width, u16 height, u16 depth, u16 mipmaps, const u32 gcm_format, const rsx::texture_dimension_extended type, const texture_create_flags flags, std::pair, std::array>& remap_vector) = 0; - virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format, + virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format, const texture_upload_context context, std::vector& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled, std::pair, std::array>& remap_vector) = 0; virtual void enforce_surface_creation_type(section_storage_type& section, const texture_create_flags expected) = 0; virtual void insert_texture_barrier() = 0; + private: + //Internal implementation methods + bool invalidate_range_impl(u32 address, u32 range, bool unprotect) + { + bool response = false; + u32 last_dirty_block = 0; + std::pair trampled_range = std::make_pair(address, address + range); + + for (auto It = m_cache.begin(); It != m_cache.end(); It++) + { + auto &range_data = It->second; + const u32 base = It->first; + bool range_reset = false; + + if (base == last_dirty_block && range_data.valid_count == 0) + continue; + + if (trampled_range.first >= (base + get_block_size()) || base >= trampled_range.second) + continue; + + for (int i = 0; i < range_data.data.size(); i++) + { + auto &tex = range_data.data[i]; + + if (tex.is_dirty()) continue; + if (!tex.is_locked()) continue; //flushable sections can be 'clean' but unlocked. TODO: Handle this better + + auto overlapped = tex.overlaps_page(trampled_range, address); + if (std::get<0>(overlapped)) + { + auto &new_range = std::get<1>(overlapped); + + if (new_range.first != trampled_range.first || + new_range.second != trampled_range.second) + { + i = 0; + trampled_range = new_range; + range_reset = true; + } + + if (unprotect) + { + tex.set_dirty(true); + tex.unprotect(); + } + else + { + tex.discard(); + } + + m_unreleased_texture_objects++; + range_data.valid_count--; + response = true; + } + } + + if (range_reset) + { + last_dirty_block = base; + It = m_cache.begin(); + } + } + + return response; + } + + template + bool flush_address_impl(u32 address, Args&&... extras) + { + bool response = false; + u32 last_dirty_block = 0; + std::pair trampled_range = std::make_pair(0xffffffff, 0x0); + + for (auto It = m_cache.begin(); It != m_cache.end(); It++) + { + auto &range_data = It->second; + const u32 base = It->first; + bool range_reset = false; + + if (base == last_dirty_block && range_data.valid_count == 0) + continue; + + if (trampled_range.first >= (base + get_block_size()) || base >= trampled_range.second) + continue; + + for (int i = 0; i < range_data.data.size(); i++) + { + auto &tex = range_data.data[i]; + + if (tex.is_dirty()) continue; + if (!tex.is_flushable()) continue; + + auto overlapped = tex.overlaps_page(trampled_range, address); + if (std::get<0>(overlapped)) + { + auto &new_range = std::get<1>(overlapped); + + if (new_range.first != trampled_range.first || + new_range.second != trampled_range.second) + { + i = 0; + trampled_range = new_range; + range_reset = true; + } + + //TODO: Map basic host_visible memory without coherent constraint + if (!tex.flush(std::forward(extras)...)) + { + //Missed address, note this + //TODO: Lower severity when successful to keep the cache from overworking + record_cache_miss(tex); + } + + response = true; + range_data.valid_count--; + } + } + + if (range_reset) + { + It = m_cache.begin(); + } + } + + return response; + } + + constexpr u32 get_block_size() const { return 0x1000000; } + inline u32 get_block_address(u32 address) const { return (address & ~0xFFFFFF); } + public: texture_cache() {} @@ -93,7 +231,9 @@ namespace rsx auto test = std::make_pair(rsx_address, range); for (auto &address_range : m_cache) { + if (address_range.second.valid_count == 0) continue; auto &range_data = address_range.second; + for (auto &tex : range_data.data) { if (tex.get_section_base() > rsx_address) @@ -109,7 +249,7 @@ namespace rsx section_storage_type *find_texture_from_dimensions(u32 rsx_address, u16 width = 0, u16 height = 0, u16 mipmaps = 0) { - auto found = m_cache.find(rsx_address); + auto found = m_cache.find(get_block_address(rsx_address)); if (found != m_cache.end()) { auto &range_data = found->second; @@ -127,59 +267,53 @@ namespace rsx section_storage_type& find_cached_texture(u32 rsx_address, u32 rsx_size, bool confirm_dimensions = false, u16 width = 0, u16 height = 0, u16 mipmaps = 0) { + const u32 block_address = get_block_address(rsx_address); + + auto found = m_cache.find(block_address); + if (found != m_cache.end()) { - reader_lock lock(m_cache_mutex); + auto &range_data = found->second; - auto found = m_cache.find(rsx_address); - if (found != m_cache.end()) + for (auto &tex : range_data.data) { - auto &range_data = found->second; - - for (auto &tex : range_data.data) + if (tex.matches(rsx_address, rsx_size) && !tex.is_dirty()) { - if (tex.matches(rsx_address, rsx_size) && !tex.is_dirty()) - { - if (!confirm_dimensions) return tex; + if (!confirm_dimensions) return tex; - if (tex.matches(rsx_address, width, height, mipmaps)) - return tex; - else - { - LOG_ERROR(RSX, "Cached object for address 0x%X was found, but it does not match stored parameters.", rsx_address); - LOG_ERROR(RSX, "%d x %d vs %d x %d", width, height, tex.get_width(), tex.get_height()); - } + if (tex.matches(rsx_address, width, height, mipmaps)) + return tex; + else + { + LOG_ERROR(RSX, "Cached object for address 0x%X was found, but it does not match stored parameters.", rsx_address); + LOG_ERROR(RSX, "%d x %d vs %d x %d", width, height, tex.get_width(), tex.get_height()); } } + } - for (auto &tex : range_data.data) + for (auto &tex : range_data.data) + { + if (tex.is_dirty()) { - if (tex.is_dirty()) + if (tex.exists()) { - if (tex.exists()) - { - m_unreleased_texture_objects--; - free_texture_section(tex); - } - - range_data.notify(rsx_size); - return tex; + m_unreleased_texture_objects--; + free_texture_section(tex); } + + range_data.notify(rsx_size); + return tex; } } } - writer_lock lock(m_cache_mutex); - section_storage_type tmp; - m_cache[rsx_address].add(tmp, rsx_size); - return m_cache[rsx_address].data.back(); + m_cache[block_address].add(tmp, rsx_size); + return m_cache[block_address].data.back(); } section_storage_type* find_flushable_section(const u32 address, const u32 range) { - reader_lock lock(m_cache_mutex); - - auto found = m_cache.find(address); + auto found = m_cache.find(get_block_address(address)); if (found != m_cache.end()) { auto &range_data = found->second; @@ -199,9 +333,8 @@ namespace rsx template void lock_memory_region(image_storage_type* image, const u32 memory_address, const u32 memory_size, const u32 width, const u32 height, const u32 pitch, Args&&... extras) { - section_storage_type& region = find_cached_texture(memory_address, memory_size, true, width, height, 1); - writer_lock lock(m_cache_mutex); + section_storage_type& region = find_cached_texture(memory_address, memory_size, true, width, height, 1); if (!region.is_locked()) { @@ -217,6 +350,7 @@ namespace rsx template bool flush_memory_to_cache(const u32 memory_address, const u32 memory_size, bool skip_synchronized, Args&&... extra) { + writer_lock lock(m_cache_mutex); section_storage_type* region = find_flushable_section(memory_address, memory_size); //TODO: Make this an assertion @@ -236,6 +370,7 @@ namespace rsx template bool load_memory_from_cache(const u32 memory_address, const u32 memory_size, Args&&... extras) { + reader_lock lock(m_cache_mutex); section_storage_type *region = find_flushable_section(memory_address, memory_size); if (region && !region->is_dirty()) @@ -256,7 +391,7 @@ namespace rsx reader_lock lock(m_cache_mutex); - auto found = m_cache.find(address); + auto found = m_cache.find(get_block_address(address)); if (found != m_cache.end()) { auto &range_data = found->second; @@ -304,74 +439,8 @@ namespace rsx address > no_access_range.second) return false; - bool response = false; - std::pair trampled_range = std::make_pair(0xffffffff, 0x0); - std::unordered_map processed_ranges; - rsx::conditional_lock lock(in_access_violation_handler, m_cache_mutex); - - for (auto It = m_cache.begin(); It != m_cache.end(); It++) - { - auto &range_data = It->second; - const u32 base = It->first; - bool range_reset = false; - - if (processed_ranges[base] || range_data.valid_count == 0) - continue; - - //Quickly discard range - const u32 lock_base = base & ~0xfff; - const u32 lock_limit = align(range_data.max_range + base, 4096); - - if ((trampled_range.first >= lock_limit || lock_base >= trampled_range.second) && - (lock_base > address || lock_limit <= address)) - { - processed_ranges[base] = true; - continue; - } - - for (int i = 0; i < range_data.data.size(); i++) - { - auto &tex = range_data.data[i]; - - if (tex.is_dirty()) continue; - if (!tex.is_flushable()) continue; - - auto overlapped = tex.overlaps_page(trampled_range, address); - if (std::get<0>(overlapped)) - { - auto &new_range = std::get<1>(overlapped); - - if (new_range.first != trampled_range.first || - new_range.second != trampled_range.second) - { - i = 0; - trampled_range = new_range; - range_reset = true; - } - - //TODO: Map basic host_visible memory without coherent constraint - if (!tex.flush(std::forward(extras)...)) - { - //Missed address, note this - //TODO: Lower severity when successful to keep the cache from overworking - record_cache_miss(tex); - } - - response = true; - } - } - - if (range_reset) - { - processed_ranges.clear(); - It = m_cache.begin(); - } - - processed_ranges[base] = true; - } - - return response; + return flush_address_impl(address, std::forward(extras)...); } bool invalidate_address(u32 address) @@ -392,76 +461,8 @@ namespace rsx return false; } - bool response = false; - std::unordered_map processed_ranges; - rsx::conditional_lock lock(in_access_violation_handler, m_cache_mutex); - - for (auto It = m_cache.begin(); It != m_cache.end(); It++) - { - auto &range_data = It->second; - const u32 base = It->first; - bool range_reset = false; - - if (processed_ranges[base] || range_data.valid_count == 0) - continue; - - //Quickly discard range - const u32 lock_base = base & ~0xfff; - const u32 lock_limit = align(range_data.max_range + base, 4096); - - if (trampled_range.first >= lock_limit || lock_base >= trampled_range.second) - { - processed_ranges[base] = true; - continue; - } - - for (int i = 0; i < range_data.data.size(); i++) - { - auto &tex = range_data.data[i]; - - if (tex.is_dirty()) continue; - if (!tex.is_locked()) continue; //flushable sections can be 'clean' but unlocked. TODO: Handle this better - - auto overlapped = tex.overlaps_page(trampled_range, address); - if (std::get<0>(overlapped)) - { - auto &new_range = std::get<1>(overlapped); - - if (new_range.first != trampled_range.first || - new_range.second != trampled_range.second) - { - i = 0; - trampled_range = new_range; - range_reset = true; - } - - if (unprotect) - { - tex.set_dirty(true); - tex.unprotect(); - } - else - { - tex.discard(); - } - - m_unreleased_texture_objects++; - range_data.valid_count--; - response = true; - } - } - - if (range_reset) - { - processed_ranges.clear(); - It = m_cache.begin(); - } - - processed_ranges[base] = true; - } - - return response; + return invalidate_range_impl(address, range, unprotect); } void record_cache_miss(section_storage_type &tex) @@ -521,6 +522,8 @@ namespace rsx void purge_dirty() { + writer_lock lock(m_cache_mutex); + //Reclaims all graphics memory consumed by dirty textures std::vector empty_addresses; empty_addresses.resize(32); @@ -611,6 +614,17 @@ namespace rsx return texptr->get_view(); } + { + //Search in cache and upload/bind + reader_lock lock(m_cache_mutex); + + auto cached_texture = find_texture_from_dimensions(texaddr, tex_width, tex_height); + if (cached_texture) + { + return cached_texture->get_raw_view(); + } + } + /* Check if we are re-sampling a subresource of an RTV/DSV texture, bound or otherwise * (Turbo: Super Stunt Squad does this; bypassing the need for a sync object) * The engine does not read back the texture resource through cell, but specifies a texture location that is @@ -664,15 +678,6 @@ namespace rsx } } - //If all the above failed, then its probably a generic texture. - //Search in cache and upload/bind - - auto cached_texture = find_texture_from_dimensions(texaddr, tex_width, tex_height); - if (cached_texture) - { - return cached_texture->get_raw_view(); - } - //Do direct upload from CPU as the last resort const auto extended_dimension = tex.get_extended_texture_dimension(); u16 height = 0; @@ -698,12 +703,13 @@ namespace rsx break; } + writer_lock lock(m_cache_mutex); const bool is_swizzled = !(tex.format() & CELL_GCM_TEXTURE_LN); auto subresources_layout = get_subresources_layout(tex); auto remap_vector = tex.decoded_remap(); return upload_image_from_cpu(cmd, texaddr, tex_width, height, depth, tex.get_exact_mipmap_count(), tex_pitch, format, - subresources_layout, extended_dimension, is_swizzled, remap_vector)->get_raw_view(); + texture_upload_context::shader_read, subresources_layout, extended_dimension, is_swizzled, remap_vector)->get_raw_view(); } template @@ -770,7 +776,9 @@ namespace rsx } } + reader_lock lock(m_cache_mutex); section_storage_type* cached_dest = nullptr; + if (!dst_is_render_target) { //First check if this surface exists in VRAM with exact dimensions @@ -785,7 +793,7 @@ namespace rsx //Prep surface enforce_surface_creation_type(*cached_dest, dst.swizzled ? rsx::texture_create_flags::swapped_native_component_order : rsx::texture_create_flags::native_component_order); - //TODO: Move this code into utils since it is used alot + const auto old_dst_area = dst_area; if (const u32 address_offset = dst.rsx_address - cached_dest->get_section_base()) { const u16 bpp = dst_is_argb8 ? 4 : 2; @@ -809,11 +817,16 @@ namespace rsx max_dst_height = cached_dest->get_height(); } else + { cached_dest = nullptr; + dst_area = old_dst_area; + } } if (!cached_dest && is_memcpy) { + lock.upgrade(); + invalidate_range_impl(dst_address, memcpy_bytes_length, true); memcpy(dst.pixels, src.pixels, memcpy_bytes_length); return true; } @@ -839,6 +852,8 @@ namespace rsx if (rsx_pitch <= 64 && native_pitch != rsx_pitch) { + lock.upgrade(); + invalidate_range_impl(dst_address, memcpy_bytes_length, true); memcpy(dst.pixels, src.pixels, memcpy_bytes_length); return true; } @@ -856,7 +871,9 @@ namespace rsx } else { - flush_address(src.rsx_address, std::forward(extras)...); + lock.upgrade(); + + flush_address_impl(src_address, std::forward(extras)...); const u16 pitch_in_block = src_is_argb8 ? src.pitch >> 2 : src.pitch >> 1; std::vector subresource_layout; @@ -869,7 +886,7 @@ namespace rsx subresource_layout.push_back(subres); const u32 gcm_format = src_is_argb8 ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5; - vram_texture = upload_image_from_cpu(cmd, src_address, src.width, src.slice_h, 1, 1, src.pitch, gcm_format, + vram_texture = upload_image_from_cpu(cmd, src_address, src.width, src.slice_h, 1, 1, src.pitch, gcm_format, texture_upload_context::blit_engine_src, subresource_layout, rsx::texture_dimension_extended::texture_dimension_2d, dst.swizzled, default_remap_vector)->get_raw_texture(); } } @@ -928,7 +945,8 @@ namespace rsx //TODO: Check for other types of format mismatch if (format_mismatch) { - invalidate_range(cached_dest->get_section_base(), cached_dest->get_section_size()); + lock.upgrade(); + invalidate_range_impl(cached_dest->get_section_base(), cached_dest->get_section_size(), true); dest_texture = 0; cached_dest = nullptr; @@ -958,6 +976,8 @@ namespace rsx else gcm_format = (dst_is_argb8) ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5; + lock.upgrade(); + dest_texture = create_new_texture(cmd, dst.rsx_address, dst.pitch * dst.clip_height, dst_dimensions.width, dst_dimensions.height, 1, 1, gcm_format, rsx::texture_dimension_extended::texture_dimension_2d, diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index c9acf61565..2436d8501c 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -462,14 +462,15 @@ void GLGSRender::end() for (int i = 0; i < rsx::limits::fragment_textures_count; ++i) { int location; - if (!rsx::method_registers.fragment_textures[i].enabled()) - continue; - - if (m_program->uniforms.has_location("tex" + std::to_string(i), &location)) + if (rsx::method_registers.fragment_textures[i].enabled() && m_program->uniforms.has_location("tex" + std::to_string(i), &location)) { m_gl_texture_cache.upload_and_bind_texture(i, get_gl_target_for_texture(rsx::method_registers.fragment_textures[i]), rsx::method_registers.fragment_textures[i], m_rtts); - m_gl_sampler_states[i].apply(rsx::method_registers.fragment_textures[i]); + + if (m_textures_dirty[i]) + m_gl_sampler_states[i].apply(rsx::method_registers.fragment_textures[i]); } + + m_textures_dirty[i] = false; } //Vertex textures diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp index dbad2b3b2e..e1f3ee3322 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp @@ -186,8 +186,22 @@ void GLGSRender::init_buffers(bool skip_reading) draw_fbo.recreate(); + bool old_format_found = false; + gl::texture::format old_format; + for (int i = 0; i < rsx::limits::color_buffers_count; ++i) { + if (surface_info[i].pitch && g_cfg.video.write_color_buffers) + { + if (!old_format_found) + { + old_format = rsx::internals::surface_color_format_to_gl(surface_info[i].color_format).format; + old_format_found = true; + } + + m_gl_texture_cache.flush_if_cache_miss_likely(old_format, surface_info[i].address, surface_info[i].pitch * surface_info[i].height); + } + if (std::get<0>(m_rtts.m_bound_render_targets[i])) { __glcheck draw_fbo.color[i] = *std::get<1>(m_rtts.m_bound_render_targets[i]); diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index 2b2cb2a53a..3bc06e2a1d 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -36,7 +36,7 @@ namespace gl case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return GL_COMPRESSED_RGBA_S3TC_DXT3_EXT; case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return GL_COMPRESSED_RGBA_S3TC_DXT5_EXT; } - fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format); + fmt::throw_exception("Unknown texture format 0x%x" HERE, texture_format); } std::tuple get_format_type(u32 texture_format) @@ -63,6 +63,9 @@ namespace gl case CELL_GCM_TEXTURE_D1R5G5B5: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV); case CELL_GCM_TEXTURE_D8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8); case CELL_GCM_TEXTURE_Y16_X16_FLOAT: return std::make_tuple(GL_RG, GL_HALF_FLOAT); + case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT1_EXT, GL_UNSIGNED_BYTE); + case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_UNSIGNED_BYTE); + case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_UNSIGNED_BYTE); } fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format); } @@ -333,7 +336,7 @@ namespace gl } void fill_texture(rsx::texture_dimension_extended dim, u16 mipmap_count, int format, u16 width, u16 height, u16 depth, - const std::vector &input_layouts, bool is_swizzled, std::vector staging_buffer) + const std::vector &input_layouts, bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector staging_buffer) { int mip_level = 0; if (is_compressed_format(format)) @@ -349,11 +352,10 @@ namespace gl glTexStorage1D(GL_TEXTURE_1D, mipmap_count, get_sized_internal_format(format), width); if (!is_compressed_format(format)) { - const auto &format_type = get_format_type(format); for (const rsx_subresource_layout &layout : input_layouts) { upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data()); + glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, gl_format, gl_type, staging_buffer.data()); } } else @@ -362,7 +364,7 @@ namespace gl { u32 size = layout.width_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16); upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glCompressedTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block * 4, get_sized_internal_format(format), size, staging_buffer.data()); + glCompressedTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block * 4, gl_format, size, staging_buffer.data()); } } return; @@ -372,11 +374,10 @@ namespace gl { if (!is_compressed_format(format)) { - const auto &format_type = get_format_type(format); for (const rsx_subresource_layout &layout : input_layouts) { upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data()); + glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data()); } } else @@ -385,7 +386,7 @@ namespace gl { u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16); upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glCompressedTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, get_sized_internal_format(format), size, staging_buffer.data()); + glCompressedTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data()); } } return; @@ -398,11 +399,10 @@ namespace gl // mip_level % mipmap_per_layer will always be equal to mip_level if (!is_compressed_format(format)) { - const auto &format_type = get_format_type(format); for (const rsx_subresource_layout &layout : input_layouts) { upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data()); + glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data()); mip_level++; } } @@ -412,7 +412,7 @@ namespace gl { u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16); upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glCompressedTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, get_sized_internal_format(format), size, staging_buffer.data()); + glCompressedTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data()); mip_level++; } } @@ -423,11 +423,10 @@ namespace gl { if (!is_compressed_format(format)) { - const auto &format_type = get_format_type(format); for (const rsx_subresource_layout &layout : input_layouts) { upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data()); + glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, gl_format, gl_type, staging_buffer.data()); } } else @@ -436,7 +435,7 @@ namespace gl { u32 size = layout.width_in_block * layout.height_in_block * layout.depth * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16); upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4); - glCompressedTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, layout.depth, get_sized_internal_format(format), size, staging_buffer.data()); + glCompressedTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, layout.depth, gl_format, size, staging_buffer.data()); } } return; @@ -529,6 +528,9 @@ namespace gl //The rest of sampler state is now handled by sampler state objects - fill_texture(type, mipmaps, gcm_format, width, height, depth, subresources_layout, is_swizzled, data_upload_buf); + const auto format_type = get_format_type(gcm_format); + const GLenum gl_format = std::get<0>(format_type); + const GLenum gl_type = std::get<1>(format_type); + fill_texture(type, mipmaps, gcm_format, width, height, depth, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf); } } diff --git a/rpcs3/Emu/RSX/GL/GLTexture.h b/rpcs3/Emu/RSX/GL/GLTexture.h index 1f7ba0f075..f2987be24a 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.h +++ b/rpcs3/Emu/RSX/GL/GLTexture.h @@ -17,6 +17,14 @@ namespace gl GLuint create_texture(u32 gcm_format, u16 width, u16 height, u16 depth, u16 mipmaps, rsx::texture_dimension_extended type); + /** + * is_swizzled - determines whether input bytes are in morton order + * subresources_layout - descriptor of the mipmap levels in memory + * decoded_remap - two vectors, first one contains index to read, e.g if v[0] = 1 then component 0[A] in the texture should read as component 1[R] + * - layout of vector is in A-R-G-B + * - second vector contains overrides to force the value to either 0 or 1 instead of reading from texture + * static_state - set up the texture without consideration for sampler state (useful for vertex textures which have no real sampler state on RSX) + */ void upload_texture(const GLuint id, const u32 texaddr, const u32 gcm_format, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, bool is_swizzled, rsx::texture_dimension_extended type, std::vector& subresources_layout, std::pair, std::array>& decoded_remap, bool static_state); diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 1bac9f2e27..efdc55a2b8 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -47,6 +47,8 @@ namespace gl texture::type type = texture::type::ubyte; bool pack_unpack_swap_bytes = false; + rsx::texture_create_flags view_flags = rsx::texture_create_flags::default_component_order; + u8 get_pixel_size(texture::format fmt_, texture::type type_) { u8 size = 1; @@ -224,6 +226,11 @@ namespace gl vram_texture = source.id(); } + void set_view_flags(const rsx::texture_create_flags flags) + { + view_flags = flags; + } + void copy_texture(bool=false) { if (!glIsTexture(vram_texture)) @@ -306,7 +313,6 @@ namespace gl glUnmapBuffer(GL_PIXEL_PACK_BUFFER); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); - protect(utils::protection::ro); return true; } @@ -410,6 +416,11 @@ namespace gl return (gl::texture::format)fmt == tex->get_internal_format(); } + + rsx::texture_create_flags get_view_flags() const + { + return view_flags; + } }; class texture_cache : public rsx::texture_cache @@ -577,28 +588,54 @@ namespace gl break; } + if (flags == rsx::texture_create_flags::swapped_native_component_order) + { + glBindTexture(GL_TEXTURE_2D, vram_texture); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ALPHA); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_RED); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_GREEN); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_BLUE); + } + auto& cached = create_texture(vram_texture, rsx_address, rsx_size, width, height); cached.protect(utils::protection::ro); cached.set_dirty(false); cached.set_depth_flag(depth_flag); + cached.set_view_flags(flags); return &cached; } cached_texture_section* upload_image_from_cpu(void*&, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format, - std::vector& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled, + const rsx::texture_upload_context context, std::vector& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled, std::pair, std::array>& remap_vector) override { void* unused = nullptr; auto section = create_new_texture(unused, rsx_address, pitch * height, width, height, depth, mipmaps, gcm_format, type, rsx::texture_create_flags::default_component_order, remap_vector); - gl::upload_texture(section->get_raw_texture(), rsx_address, gcm_format, width, height, depth, mipmaps, pitch, swizzled, type, subresource_layout, remap_vector, false); + //Swizzling is ignored for blit engine copy and emulated using remapping + bool input_swizzled = (context == rsx::texture_upload_context::blit_engine_src)? false : swizzled; + + gl::upload_texture(section->get_raw_texture(), rsx_address, gcm_format, width, height, depth, mipmaps, pitch, input_swizzled, type, subresource_layout, remap_vector, false); return section; } void enforce_surface_creation_type(cached_texture_section& section, const rsx::texture_create_flags flags) override { + if (flags == section.get_view_flags()) + return; + + if (flags == rsx::texture_create_flags::swapped_native_component_order) + { + glBindTexture(GL_TEXTURE_2D, section.get_raw_texture()); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ALPHA); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_RED); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_GREEN); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_BLUE); + } + + section.set_view_flags(flags); } void insert_texture_barrier() override @@ -630,6 +667,8 @@ namespace gl bool is_depth_texture(const u32 rsx_address) override { + reader_lock lock(m_cache_mutex); + auto section = find_texture_from_range(rsx_address, 64u); if (section != nullptr) return section->is_depth_texture(); diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index e39489feda..b59773cce0 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -403,7 +403,8 @@ namespace rsx std::vector > split_ranges; auto first_count_cmds = method_registers.current_draw_clause.first_count_commands; - if (method_registers.current_draw_clause.first_count_commands.size() > 1) + if (method_registers.current_draw_clause.first_count_commands.size() > 1 && + method_registers.current_draw_clause.is_disjoint_primitive) { u32 next = method_registers.current_draw_clause.first_count_commands.front().first; u32 last_head = 0; @@ -433,13 +434,18 @@ namespace rsx { std::vector> tmp; auto list_head = first_count_cmds.begin(); + bool emit_begin = false; for (auto &range : split_ranges) { tmp.resize(range.second - range.first + 1); std::copy(list_head + range.first, list_head + range.second, tmp.begin()); - methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, deferred_primitive_type); + if (emit_begin) + methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, deferred_primitive_type); + else + emit_begin = true; + method_registers.current_draw_clause.first_count_commands = tmp; methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, 0); } @@ -565,41 +571,44 @@ namespace rsx deferred_primitive_type = value; else { - deferred_call_size++; - - // Combine all calls since the last one - auto &first_count = method_registers.current_draw_clause.first_count_commands; - if (first_count.size() > deferred_call_size) - { - const auto &batch_first_count = first_count[deferred_call_size - 1]; - u32 count = batch_first_count.second; - u32 next = batch_first_count.first + count; - - for (int n = deferred_call_size; n < first_count.size(); n++) - { - if (first_count[n].first != next) - { - LOG_ERROR(RSX, "Non-continous first-count range passed as one draw; will be split."); - - first_count[deferred_call_size - 1].second = count; - deferred_call_size++; - - count = first_count[deferred_call_size - 1].second; - next = first_count[deferred_call_size - 1].first + count; - continue; - } - - count += first_count[n].second; - next += first_count[n].second; - } - - first_count[deferred_call_size - 1].second = count; - first_count.resize(deferred_call_size); - } - has_deferred_call = true; flush_commands_flag = false; execute_method_call = false; + + deferred_call_size++; + + if (method_registers.current_draw_clause.is_disjoint_primitive) + { + // Combine all calls since the last one + auto &first_count = method_registers.current_draw_clause.first_count_commands; + if (first_count.size() > deferred_call_size) + { + const auto &batch_first_count = first_count[deferred_call_size - 1]; + u32 count = batch_first_count.second; + u32 next = batch_first_count.first + count; + + for (int n = deferred_call_size; n < first_count.size(); n++) + { + if (first_count[n].first != next) + { + LOG_ERROR(RSX, "Non-continous first-count range passed as one draw; will be split."); + + first_count[deferred_call_size - 1].second = count; + deferred_call_size++; + + count = first_count[deferred_call_size - 1].second; + next = first_count[deferred_call_size - 1].first + count; + continue; + } + + count += first_count[n].second; + next += first_count[n].second; + } + + first_count[deferred_call_size - 1].second = count; + first_count.resize(deferred_call_size); + } + } } break; @@ -1049,24 +1058,33 @@ namespace rsx void thread::get_current_vertex_program() { - auto &result = current_vertex_program = {}; - const u32 transform_program_start = rsx::method_registers.transform_program_start(); - result.data.reserve((512 - transform_program_start) * 4); - result.rsx_vertex_inputs.reserve(rsx::limits::vertex_count); + current_vertex_program.output_mask = rsx::method_registers.vertex_attrib_output_mask(); + current_vertex_program.skip_vertex_input_check = false; + + current_vertex_program.rsx_vertex_inputs.resize(0); + current_vertex_program.data.resize(512 * 4); + current_vertex_program.rsx_vertex_inputs.reserve(rsx::limits::vertex_count); + + u32* ucode_src = rsx::method_registers.transform_program.data() + (transform_program_start * 4); + u32* ucode_dst = current_vertex_program.data.data(); + u32 ucode_size = 0; + D3 d3; for (int i = transform_program_start; i < 512; ++i) { - result.data.resize((i - transform_program_start) * 4 + 4); - memcpy(result.data.data() + (i - transform_program_start) * 4, rsx::method_registers.transform_program.data() + i * 4, 4 * sizeof(u32)); - - D3 d3; - d3.HEX = rsx::method_registers.transform_program[i * 4 + 3]; + ucode_size += 4; + memcpy(ucode_dst, ucode_src, 4 * sizeof(u32)); + d3.HEX = ucode_src[3]; if (d3.end) break; + + ucode_src += 4; + ucode_dst += 4; } - result.output_mask = rsx::method_registers.vertex_attrib_output_mask(); + + current_vertex_program.data.resize(ucode_size); const u32 input_mask = rsx::method_registers.vertex_attrib_input_mask(); const u32 modulo_mask = rsx::method_registers.frequency_divider_operation_mask(); @@ -1079,7 +1097,7 @@ namespace rsx if (rsx::method_registers.vertex_arrays_info[index].size() > 0) { - result.rsx_vertex_inputs.push_back( + current_vertex_program.rsx_vertex_inputs.push_back( {index, rsx::method_registers.vertex_arrays_info[index].size(), rsx::method_registers.vertex_arrays_info[index].frequency(), @@ -1089,7 +1107,7 @@ namespace rsx } else if (vertex_push_buffers[index].vertex_count > 1) { - result.rsx_vertex_inputs.push_back( + current_vertex_program.rsx_vertex_inputs.push_back( { index, rsx::method_registers.register_vertex_info[index].size, 1, @@ -1099,7 +1117,7 @@ namespace rsx } else if (rsx::method_registers.register_vertex_info[index].size > 0) { - result.rsx_vertex_inputs.push_back( + current_vertex_program.rsx_vertex_inputs.push_back( {index, rsx::method_registers.register_vertex_info[index].size, rsx::method_registers.register_vertex_info[index].frequency, diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 1b0193c329..c7395d3f01 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -565,7 +565,7 @@ namespace vk } cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format, - std::vector& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled, + const rsx::texture_upload_context context, std::vector& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled, std::pair, std::array>& remap_vector) override { auto section = create_new_texture(cmd, rsx_address, pitch * height, width, height, depth, mipmaps, gcm_format, type, @@ -578,7 +578,10 @@ namespace vk vk::enter_uninterruptible(); - vk::copy_mipmaped_image_using_buffer(cmd, image->value, subresource_layout, gcm_format, swizzled, mipmaps, subres_range.aspectMask, + //Swizzling is ignored for blit engine copy and emulated using a swapped order image view + bool input_swizzled = (context == rsx::texture_upload_context::blit_engine_src) ? false : swizzled; + + vk::copy_mipmaped_image_using_buffer(cmd, image->value, subresource_layout, gcm_format, input_swizzled, mipmaps, subres_range.aspectMask, *m_texture_upload_heap, m_texture_upload_buffer); vk::leave_uninterruptible(); diff --git a/rpcs3/Emu/RSX/rsx_cache.h b/rpcs3/Emu/RSX/rsx_cache.h index 930701aec2..df14d41f10 100644 --- a/rpcs3/Emu/RSX/rsx_cache.h +++ b/rpcs3/Emu/RSX/rsx_cache.h @@ -70,7 +70,7 @@ namespace rsx bool locked = false; bool dirty = false; - inline bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2) + inline bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2) const { return (base1 < limit2 && base2 < limit1); } @@ -133,12 +133,12 @@ namespace rsx locked = false; } - bool overlaps(std::pair range) + bool overlaps(std::pair range) const { return region_overlaps(locked_address_base, locked_address_base + locked_address_range, range.first, range.first + range.second); } - bool overlaps(u32 address) + bool overlaps(u32 address) const { return (locked_address_base <= address && (address - locked_address_base) < locked_address_range); } @@ -148,7 +148,7 @@ namespace rsx * ignore_protection_range - if true, the test should not check against the aligned protection range, instead * tests against actual range of contents in memory */ - bool overlaps(std::pair range, bool ignore_protection_range) + bool overlaps(std::pair range, bool ignore_protection_range) const { if (!ignore_protection_range) return region_overlaps(locked_address_base, locked_address_base + locked_address_range, range.first, range.first + range.second); @@ -160,7 +160,7 @@ namespace rsx * Check if the page containing the address tramples this section. Also compares a former trampled page range to compare * If true, returns the range with updated invalid range */ - std::tuple> overlaps_page(std::pair old_range, u32 address) + std::tuple> overlaps_page(std::pair old_range, u32 address) const { const u32 page_base = address & ~4095; const u32 page_limit = address + 4096; @@ -204,7 +204,7 @@ namespace rsx return (cpu_address_base == cpu_address && cpu_address_range == size); } - std::pair get_min_max(std::pair current_min_max) + std::pair get_min_max(std::pair current_min_max) const { u32 min = std::min(current_min_max.first, locked_address_base); u32 max = std::max(current_min_max.second, locked_address_base + locked_address_range);