rsx: Texture cache improvements

- Limits buffer size to a minimum of 720 lines in the Y axis (a 1024-line section causes conflicts in some cases - TODO)
rsx: Fixups to allow large textures for the blit operation
- Also adds checks for leaking sections and for invalid blit regions on Vulkan
Hotfix for hanging when using WCB (Write Color Buffers)
Addendum - unlock both read-only (ro) and no-access (no) blocks before attempting to copy memory blocks
gl: Fixups for ARB_explicit_uniform_location
- Forces GLSL version 430, where explicit uniform locations are available without an extension directive
rsx/vk: Rework texture cache to minimize recursive access violations
- Also modifies the Vulkan command buffer begin/end/submit mechanism
gl: Fix cached_texture_section::is_flushable to take memory protection into account
rsx: Fix blit dst offset calculation
kd-11 committed on 2017-09-15 01:32:23 +03:00
parent 10e25eb362, commit 571dbfb7b1
11 changed files with 168 additions and 94 deletions


@ -523,7 +523,7 @@ namespace rsx
* address_is_bound - returns true if the surface at a given address is actively bound
* get_surface_subresource_if_available - returns a section descriptor that allows cropping of surfaces stored in memory
*/
bool surface_overlaps_address(surface_type surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit)
bool surface_overlaps_address(surface_type surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit, bool double_height)
{
bool is_subslice = false;
u16 x_offset = 0;
@ -535,7 +535,6 @@ namespace rsx
u32 offset = texaddr - surface_address;
if (texaddr >= surface_address)
{
if (offset == 0)
{
is_subslice = true;
@ -546,6 +545,8 @@ namespace rsx
Traits::get_surface_info(surface, &info);
u32 range = info.rsx_pitch * info.surface_height;
if (double_height) range *= 2;
if (offset < range)
{
const u32 y = (offset / info.rsx_pitch);
@ -560,6 +561,7 @@ namespace rsx
x_offset = x;
y_offset = y;
if (double_height) y_offset /= 2;
is_subslice = true;
}
}
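A short worked sketch of the double_height handling above, with made-up numbers (pitch, height and offset are illustrative only; u32/u16 are RPCS3 aliases): the candidate range is doubled so a compressed-Y source whose footprint spans twice the surface height still matches, and the derived row is halved to land back in surface coordinates.

    // Sketch only - mirrors the logic above with assumed values.
    static u16 row_in_surface(u32 offset, u32 rsx_pitch, bool double_height)
    {
        const u32 y = offset / rsx_pitch;              // row inside the (possibly doubled) region
        return double_height ? (u16)(y / 2) : (u16)y;  // back to surface coordinates
    }
    // Example: pitch 5120 (1280 px * 4 B), surface_height 720 -> range = 3,686,400 bytes.
    // With double_height the accepted range doubles to 7,372,800, so an offset of
    // 4,000,000 still matches and maps to row 4,000,000 / 5120 = 781 -> 781 / 2 = 390.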
@ -602,11 +604,11 @@ namespace rsx
return true;
}
surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit = false, bool crop = false, bool ignore_depth_formats = false)
surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit = false, bool crop = false, bool ignore_depth_formats = false, bool double_height = false)
{
auto test_surface = [&](surface_type surface, u32 this_address, u16 &x_offset, u16 &y_offset, u16 &w, u16 &h, bool &clipped)
{
if (surface_overlaps_address(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit))
if (surface_overlaps_address(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit, double_height))
{
surface_format_info info;
Traits::get_surface_info(surface, &info);
@ -625,7 +627,7 @@ namespace rsx
if (region_fits(info.surface_width, info.surface_height, x_offset, y_offset, real_width, requested_height))
{
w = real_width;
h = info.surface_height;
h = requested_height;
clipped = false;
return true;


@ -53,6 +53,12 @@ namespace rsx
data.push_back(std::move(section));
}
void remove_one()
{
verify(HERE), valid_count > 0;
valid_count--;
}
};
// Keep track of cache misses to pre-emptively flush some addresses
@ -104,8 +110,12 @@ namespace rsx
if (base == last_dirty_block && range_data.valid_count == 0)
continue;
if (trampled_range.first >= (base + get_block_size()) || base >= trampled_range.second)
if (trampled_range.first < trampled_range.second)
{
//Only if a valid range, ignore empty sets
if (trampled_range.first >= (base + range_data.max_range + get_block_size()) || base >= trampled_range.second)
continue;
}
for (int i = 0; i < range_data.data.size(); i++)
{
@ -138,7 +148,7 @@ namespace rsx
}
m_unreleased_texture_objects++;
range_data.valid_count--;
range_data.remove_one();
response = true;
}
}
@ -159,6 +169,7 @@ namespace rsx
bool response = false;
u32 last_dirty_block = 0;
std::pair<u32, u32> trampled_range = std::make_pair(0xffffffff, 0x0);
std::vector<section_storage_type*> sections_to_flush;
for (auto It = m_cache.begin(); It != m_cache.end(); It++)
{
@ -169,8 +180,12 @@ namespace rsx
if (base == last_dirty_block && range_data.valid_count == 0)
continue;
if (trampled_range.first >= (base + get_block_size()) || base >= trampled_range.second)
if (trampled_range.first < trampled_range.second)
{
//Only if a valid range, ignore empty sets
if (trampled_range.first >= (base + range_data.max_range + get_block_size()) || base >= trampled_range.second)
continue;
}
for (int i = 0; i < range_data.data.size(); i++)
{
@ -192,16 +207,12 @@ namespace rsx
range_reset = true;
}
//TODO: Map basic host_visible memory without coherent constraint
if (!tex.flush(std::forward<Args>(extras)...))
{
//Missed address, note this
//TODO: Lower severity when successful to keep the cache from overworking
record_cache_miss(tex);
}
//Defer actual flush operation until all affected regions are cleared to prevent recursion
tex.unprotect();
sections_to_flush.push_back(&tex);
response = true;
range_data.valid_count--;
range_data.remove_one();
}
}
@ -211,6 +222,16 @@ namespace rsx
}
}
for (auto tex : sections_to_flush)
{
if (!tex->flush(std::forward<Args>(extras)...))
{
//Missed address, note this
//TODO: Lower severity when successful to keep the cache from overworking
record_cache_miss(*tex);
}
}
return response;
}
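The reworked flush path above separates "unprotect" from "flush": every affected section is unprotected and collected first, and only then is data copied back to guest memory, so the copy cannot fault on a still-protected page and re-enter the handler. A minimal sketch of that pattern with placeholder types (not the RSX cache classes):

    #include <vector>

    // Placeholder section type; the real cache sections carry far more state.
    struct section_sketch
    {
        void unprotect() { /* restore host access rights to the backing pages */ }
        bool flush()     { /* copy GPU data back to guest memory */ return true; }
    };

    inline bool flush_overlapping(std::vector<section_sketch*>& affected)
    {
        std::vector<section_sketch*> sections_to_flush;
        for (auto* tex : affected)
        {
            tex->unprotect();                 // phase 1: only drop protection
            sections_to_flush.push_back(tex);
        }

        bool response = false;
        for (auto* tex : sections_to_flush)   // phase 2: the actual flushes
        {
            tex->flush();
            response = true;
        }
        return response;
    }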
@ -334,7 +355,7 @@ namespace rsx
void lock_memory_region(image_storage_type* image, const u32 memory_address, const u32 memory_size, const u32 width, const u32 height, const u32 pitch, Args&&... extras)
{
writer_lock lock(m_cache_mutex);
section_storage_type& region = find_cached_texture(memory_address, memory_size, true, width, height, 1);
section_storage_type& region = find_cached_texture(memory_address, memory_size, false);
if (!region.is_locked())
{
@ -389,7 +410,7 @@ namespace rsx
address > no_access_range.second)
return std::make_tuple(false, nullptr);
reader_lock lock(m_cache_mutex);
rsx::conditional_lock<shared_mutex> lock(in_access_violation_handler, m_cache_mutex);
auto found = m_cache.find(get_block_address(address));
if (found != m_cache.end())
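The rsx::conditional_lock used above is what keeps a lookup triggered from inside the access violation handler from dead-locking on m_cache_mutex, which the faulting thread may effectively already hold. Roughly, the guard only acquires the mutex when the "already in the handler" flag is clear; the following is an assumed sketch of that idea, not the actual RPCS3 helper:

    // Assumed sketch of a conditional lock guard (not RPCS3's implementation).
    template <typename lock_t>
    struct conditional_lock_sketch
    {
        lock_t& m_lock;
        bool    m_owns = false;

        conditional_lock_sketch(bool& in_handler_flag, lock_t& lock)
            : m_lock(lock)
        {
            if (!in_handler_flag)      // only lock when not re-entering from the handler
            {
                m_lock.lock_shared();
                m_owns = true;
            }
        }

        ~conditional_lock_sketch()
        {
            if (m_owns) m_lock.unlock_shared();
        }
    };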
@ -730,11 +751,11 @@ namespace rsx
const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0));
//Check if src/dst are parts of render targets
auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst.rsx_address, dst.width, dst.clip_height, dst.pitch, true, true, false);
auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst.rsx_address, dst.width, dst.clip_height, dst.pitch, true, true, false, dst.compressed_y);
dst_is_render_target = dst_subres.surface != nullptr;
//TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate
auto src_subres = m_rtts.get_surface_subresource_if_applicable(src.rsx_address, src.width, src.height, src.pitch, true, true, false);
auto src_subres = m_rtts.get_surface_subresource_if_applicable(src.rsx_address, src.width, src.slice_h, src.pitch, true, true, false, src.compressed_y);
src_is_render_target = src_subres.surface != nullptr;
//Always use GPU blit if src or dst is in the surface store
@ -747,15 +768,37 @@ namespace rsx
float scale_x = dst.scale_x;
float scale_y = dst.scale_y;
size2i clip_dimensions = { dst.clip_width, dst.clip_height };
//TODO: Investigate effects of compression in X axis
if (dst.compressed_y)
{
scale_y *= 0.5f;
}
//Dimensions passed are restricted to powers of 2; get real height from clip_height and width from pitch
size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.clip_height };
if (src.compressed_y)
{
scale_y *= 2.f;
}
//1024 height is a hack (for ~720p buffers)
//It is possible to have a large buffer of up to around 4k x 4k, but anything above 1280x720 is rare
//RSX only handles 512x512 tiles, so texture 'stitching' will eventually be needed for complete accuracy
//For example, blitting a 720p surface to the backbuffer would submit sections as (512x512 + 512x512 + 256x512 + 512x208 + 512x208 + 256x208)
int practical_height;
if (dst.max_tile_h < dst.height || !src_is_render_target)
practical_height = (s32)dst.height;
else
{
//Hack
practical_height = std::min((s32)dst.max_tile_h, 1024);
}
size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), practical_height };
//Offset in x and y for src is 0 (it is already accounted for when getting pixels_src)
//Reproject final clip onto source...
const u16 src_w = (const u16)((f32)clip_dimensions.width / dst.scale_x);
const u16 src_h = (const u16)((f32)clip_dimensions.height / dst.scale_y);
const u16 src_w = (const u16)((f32)dst.clip_width / scale_x);
const u16 src_h = (const u16)((f32)dst.clip_height / scale_y);
areai src_area = { 0, 0, src_w, src_h };
areai dst_area = { 0, 0, dst.clip_width, dst.clip_height };
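With the compression handling above, the effective scale is adjusted before the clip is reprojected onto the source. A worked sketch with assumed values (a 1280x720 ARGB8 destination, scale 1.0, compressed-Y destination buffer):

    // Assumed inputs for illustration only.
    const float scale_x_in = 1.f, scale_y_in = 1.f;
    const int   clip_w = 1280, clip_h = 720;        // ARGB8, pitch = 1280 * 4 = 5120

    const bool  dst_compressed_y = true;
    const float scale_y_eff = dst_compressed_y ? scale_y_in * 0.5f : scale_y_in;

    const int src_w = (int)(clip_w / scale_x_in);   // 1280
    const int src_h = (int)(clip_h / scale_y_eff);  // 1440 - source covers twice the clip height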
@ -794,7 +837,7 @@ namespace rsx
enforce_surface_creation_type(*cached_dest, dst.swizzled ? rsx::texture_create_flags::swapped_native_component_order : rsx::texture_create_flags::native_component_order);
const auto old_dst_area = dst_area;
if (const u32 address_offset = dst.rsx_address - cached_dest->get_section_base())
if (const u32 address_offset = dst_address - cached_dest->get_section_base())
{
const u16 bpp = dst_is_argb8 ? 4 : 2;
const u16 offset_y = address_offset / dst.pitch;
@ -826,6 +869,7 @@ namespace rsx
if (!cached_dest && is_memcpy)
{
lock.upgrade();
flush_address_impl(src_address, std::forward<Args>(extras)...);
invalidate_range_impl(dst_address, memcpy_bytes_length, true);
memcpy(dst.pixels, src.pixels, memcpy_bytes_length);
return true;
@ -853,6 +897,7 @@ namespace rsx
if (rsx_pitch <= 64 && native_pitch != rsx_pitch)
{
lock.upgrade();
flush_address_impl(src_address, std::forward<Args>(extras)...);
invalidate_range_impl(dst_address, memcpy_bytes_length, true);
memcpy(dst.pixels, src.pixels, memcpy_bytes_length);
return true;
@ -892,13 +937,13 @@ namespace rsx
}
else
{
if (src_subres.w != clip_dimensions.width ||
src_subres.h != clip_dimensions.height)
if (src_subres.w != dst.clip_width ||
src_subres.h != dst.clip_height)
{
f32 subres_scaling_x = (f32)src.pitch / src_subres.surface->get_native_pitch();
const int dst_width = (int)(src_subres.w * dst.scale_x * subres_scaling_x);
const int dst_height = (int)(src_subres.h * dst.scale_y);
const int dst_width = (int)(src_subres.w * scale_x * subres_scaling_x);
const int dst_height = (int)(src_subres.h * scale_y);
dst_area.x2 = dst_area.x1 + dst_width;
dst_area.y2 = dst_area.y1 + dst_height;
@ -912,14 +957,6 @@ namespace rsx
src_area.y1 += src_subres.y;
src_area.y2 += src_subres.y;
if (src.compressed_y)
{
dst_area.y1 *= 2;
dst_area.y2 *= 2;
dst_dimensions.height *= 2;
}
vram_texture = src_subres.surface->get_surface();
}
@ -959,8 +996,8 @@ namespace rsx
//Reproject clip offsets onto source to simplify blit
if (dst.clip_x || dst.clip_y)
{
const u16 scaled_clip_offset_x = (const u16)((f32)dst.clip_x / dst.scale_x);
const u16 scaled_clip_offset_y = (const u16)((f32)dst.clip_y / dst.scale_y);
const u16 scaled_clip_offset_x = (const u16)((f32)dst.clip_x / scale_x);
const u16 scaled_clip_offset_y = (const u16)((f32)dst.clip_y / scale_y);
src_area.x1 += scaled_clip_offset_x;
src_area.x2 += scaled_clip_offset_x;
@ -978,7 +1015,7 @@ namespace rsx
lock.upgrade();
dest_texture = create_new_texture(cmd, dst.rsx_address, dst.pitch * dst.clip_height,
dest_texture = create_new_texture(cmd, dst.rsx_address, dst.pitch * dst_dimensions.height,
dst_dimensions.width, dst_dimensions.height, 1, 1,
gcm_format, rsx::texture_dimension_extended::texture_dimension_2d,
dst.swizzled? rsx::texture_create_flags::swapped_native_component_order : rsx::texture_create_flags::native_component_order,


@ -30,7 +30,7 @@ std::string GLFragmentDecompilerThread::compareFunction(COMPARE f, const std::st
void GLFragmentDecompilerThread::insertHeader(std::stringstream & OS)
{
OS << "#version 420\n";
OS << "#version 430\n";
}
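The bump to #version 430 matters because explicit uniform locations (layout(location = N) uniform ...) are core only from GLSL 430 onwards; under 420 they would need GL_ARB_explicit_uniform_location to be enabled explicitly. A small sketch of the kind of header and declarations this enables (the shader lines are illustrative, not the decompiler's actual output):

    #include <sstream>
    #include <string>

    // Sketch: a GLSL 430 header makes explicit uniform locations legal
    // without an #extension directive.
    static std::string make_header_sketch()
    {
        std::stringstream OS;
        OS << "#version 430\n";
        OS << "layout(location = 0) uniform sampler2D tex0;\n"; // needs 430 or ARB_explicit_uniform_location
        OS << "layout(location = 1) uniform sampler2D tex1;\n";
        return OS.str();
    }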
void GLFragmentDecompilerThread::insertIntputs(std::stringstream & OS)


@ -355,7 +355,7 @@ namespace gl
bool is_flushable() const
{
return pbo_id != 0;
return (locked && pbo_id != 0);
}
bool is_flushed() const


@ -32,7 +32,6 @@ std::string GLVertexDecompilerThread::compareFunction(COMPARE f, const std::stri
void GLVertexDecompilerThread::insertHeader(std::stringstream &OS)
{
OS << "#version 430\n";
OS << "#extension GL_ARB_separate_program_objects: enable\n\n";
OS << "layout(std140, binding = 0) uniform VertexContextBuffer\n";
OS << "{\n";
OS << " mat4 scale_offset_mat;\n";


@ -399,7 +399,7 @@ namespace rsx
auto flush_command_queue = [&]()
{
const auto num_draws = method_registers.current_draw_clause.first_count_commands.size();
const auto num_draws = (u32)method_registers.current_draw_clause.first_count_commands.size();
bool emit_begin = false;
bool emit_end = true;
@ -411,7 +411,7 @@ namespace rsx
u32 last = first_counts.front().first;
u32 last_index = 0;
for (size_t draw = 0; draw < num_draws; draw++)
for (u32 draw = 0; draw < num_draws; draw++)
{
if (first_counts[draw].first != last)
{


@ -2013,32 +2013,14 @@ void VKGSRender::write_buffers()
void VKGSRender::close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags)
{
CHECK_RESULT(vkEndCommandBuffer(*m_current_command_buffer));
VkCommandBuffer cmd = *m_current_command_buffer;
VkSubmitInfo infos = {};
infos.commandBufferCount = 1;
infos.pCommandBuffers = &cmd;
infos.pWaitDstStageMask = &pipeline_stage_flags;
infos.pWaitSemaphores = semaphores.data();
infos.waitSemaphoreCount = static_cast<uint32_t>(semaphores.size());
infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
m_current_command_buffer->end();
m_current_command_buffer->tag();
CHECK_RESULT(vkQueueSubmit(m_swap_chain->get_present_queue(), 1, &infos, fence));
m_current_command_buffer->submit(m_swap_chain->get_present_queue(), semaphores, fence, pipeline_stage_flags);
}
void VKGSRender::open_command_buffer()
{
VkCommandBufferInheritanceInfo inheritance_info = {};
inheritance_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
VkCommandBufferBeginInfo begin_infos = {};
begin_infos.pInheritanceInfo = &inheritance_info;
begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
CHECK_RESULT(vkBeginCommandBuffer(*m_current_command_buffer, &begin_infos));
m_current_command_buffer->begin();
}
@ -2587,5 +2569,8 @@ bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst
{
close_render_pass();
return m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer);
auto result = m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer);
m_current_command_buffer->begin();
return result;
}


@ -1040,6 +1040,9 @@ namespace vk
class command_buffer
{
private:
bool is_open = false;
protected:
vk::command_pool *pool = nullptr;
VkCommandBuffer commands = nullptr;
@ -1074,6 +1077,53 @@ namespace vk
{
return commands;
}
void begin()
{
if (is_open)
return;
VkCommandBufferInheritanceInfo inheritance_info = {};
inheritance_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
VkCommandBufferBeginInfo begin_infos = {};
begin_infos.pInheritanceInfo = &inheritance_info;
begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
CHECK_RESULT(vkBeginCommandBuffer(commands, &begin_infos));
is_open = true;
}
void end()
{
if (!is_open)
{
LOG_ERROR(RSX, "commandbuffer->end was called but commandbuffer is not in a recording state");
return;
}
CHECK_RESULT(vkEndCommandBuffer(commands));
is_open = false;
}
void submit(VkQueue queue, const std::vector<VkSemaphore> &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags)
{
if (is_open)
{
LOG_ERROR(RSX, "commandbuffer->submit was called whilst the command buffer is in a recording state");
return;
}
VkSubmitInfo infos = {};
infos.commandBufferCount = 1;
infos.pCommandBuffers = &commands;
infos.pWaitDstStageMask = &pipeline_stage_flags;
infos.pWaitSemaphores = semaphores.data();
infos.waitSemaphoreCount = static_cast<uint32_t>(semaphores.size());
infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
CHECK_RESULT(vkQueueSubmit(queue, 1, &infos, fence));
}
};
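With the wrapper above, begin() is a no-op on an already-recording buffer and end()/submit() guard against being called in the wrong state, so callers no longer need to track whether some helper already closed and submitted the buffer. A short usage sketch (queue, fence and the recorded work are placeholders):

    // Usage sketch - the recorded commands and sync objects are placeholders.
    static void record_and_submit(vk::command_buffer& cmd, VkQueue queue, VkFence fence)
    {
        cmd.begin();                          // opens recording; ignored if already open
        // ... record draw/copy commands into cmd here ...
        cmd.end();                            // closes recording; logs if it was not open
        cmd.submit(queue, {}, fence,          // no wait semaphores in this sketch
                   VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);

        cmd.begin();                          // safe to reopen for the next batch of work
    }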
class context


@ -183,16 +183,7 @@ namespace vk
if (manage_cb_lifetime)
{
//cb has to be guaranteed to be in a closed state
//This function can be called asynchronously
VkCommandBufferInheritanceInfo inheritance_info = {};
inheritance_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
VkCommandBufferBeginInfo begin_infos = {};
begin_infos.pInheritanceInfo = &inheritance_info;
begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
CHECK_RESULT(vkBeginCommandBuffer(cmd, &begin_infos));
cmd.begin();
}
VkBufferImageCopy copyRegion = {};
@ -212,20 +203,8 @@ namespace vk
if (manage_cb_lifetime)
{
CHECK_RESULT(vkEndCommandBuffer(cmd));
VkPipelineStageFlags pipe_stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
VkCommandBuffer command_buffer = cmd;
VkSubmitInfo infos = {};
infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
infos.commandBufferCount = 1;
infos.pCommandBuffers = &command_buffer;
infos.pWaitDstStageMask = &pipe_stage_flags;
infos.pWaitSemaphores = nullptr;
infos.waitSemaphoreCount = 0;
CHECK_RESULT(vkQueueSubmit(submit_queue, 1, &infos, dma_fence));
cmd.end();
cmd.submit(submit_queue, {}, dma_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
//Now we need to restart the command-buffer to restore it to the way it was before...
CHECK_RESULT(vkWaitForFences(*m_device, 1, &dma_fence, VK_TRUE, UINT64_MAX));
@ -699,6 +678,25 @@ namespace vk
VkImageAspectFlagBits aspect = VK_IMAGE_ASPECT_COLOR_BIT;
if (is_depth) aspect = (VkImageAspectFlagBits)(src->info.format == VK_FORMAT_D16_UNORM ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
//Checks
if (src_area.x2 <= src_area.x1 || src_area.y2 <= src_area.y1 || dst_area.x2 <= dst_area.x1 || dst_area.y2 <= dst_area.y1)
{
LOG_ERROR(RSX, "Blit request consists of an empty region descriptor!");
return;
}
if (src_area.x1 < 0 || src_area.x2 > (s32)src->width() || src_area.y1 < 0 || src_area.y2 > (s32)src->height())
{
LOG_ERROR(RSX, "Blit request denied because the source region does not fit!");
return;
}
if (dst_area.x1 < 0 || dst_area.x2 > (s32)dst->width() || dst_area.y1 < 0 || dst_area.y2 > (s32)dst->height())
{
LOG_ERROR(RSX, "Blit request denied because the destination region does not fit!");
return;
}
copy_scaled_image(*commands, src->value, dst->value, src->current_layout, dst->current_layout, src_area.x1, src_area.y1, src_area.x2 - src_area.x1, src_area.y2 - src_area.y1,
dst_area.x1, dst_area.y1, dst_area.x2 - dst_area.x1, dst_area.y2 - dst_area.y1, 1, aspect);


@ -38,6 +38,7 @@ namespace rsx
u16 clip_y;
u16 clip_width;
u16 clip_height;
u16 max_tile_h;
f32 scale_x;
f32 scale_y;


@ -661,6 +661,8 @@ namespace rsx
dst_info.compressed_x = true;
break;
}
dst_info.max_tile_h = static_cast<u16>((dst_region.tile->size - dst_region.base) / out_pitch);
}
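max_tile_h here is simply how many output rows fit in the remainder of the destination tile. A quick worked example with assumed numbers (u16/u32 are RPCS3 aliases):

    // Illustrative values only.
    const u32 tile_size = 0x280000;   // 2,621,440-byte tile (hypothetical)
    const u32 tile_base = 0;
    const u32 out_pitch = 5120;       // 1280 ARGB8 pixels per row

    const u16 max_tile_h = static_cast<u16>((tile_size - tile_base) / out_pitch);  // = 512 rows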
if (dst_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER)