rsx: Experiments with nul sink

kd-11 2019-09-04 22:19:58 +03:00 committed by kd-11
parent 212ac19c11
commit 858014b718
12 changed files with 420 additions and 301 deletions

View File

@ -12,7 +12,8 @@ namespace rsx
shader_read = 1,
blit_engine_src = 2,
blit_engine_dst = 4,
framebuffer_storage = 8
framebuffer_storage = 8,
dma = 16
};
enum texture_colorspace : u32
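The upload-context values stay powers of two, so the new dma member can coexist with the other flags in mask-style filtering. The lookup later in this commit passes a single required_type, but a minimal illustration of the bitmask idea looks like this (the OR'd mask query and get_context() usage are illustrative, not taken from this diff):

const u32 wanted = rsx::texture_upload_context::blit_engine_dst | rsx::texture_upload_context::dma;
if (section.get_context() & wanted)
{
    // Candidate section for reuse by the blit/DMA destination path
}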

View File

@ -775,6 +775,9 @@ namespace rsx
continue;
auto surface = tex_info.second.get();
if (access == rsx::surface_access::transfer && surface->write_through())
continue;
if (!rsx::pitch_compatible(surface, required_pitch, required_height))
continue;

View File

@ -309,6 +309,11 @@ namespace rsx
return (state_flags != rsx::surface_state_flags::ready) || !old_contents.empty();
}
bool write_through() const
{
return (state_flags & rsx::surface_state_flags::erase_bkgnd) && old_contents.empty();
}
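// A surface still awaiting its background clear (erase_bkgnd) with no pending
// old_contents to merge holds nothing worth reading back, so transfer-access
// lookups can skip it rather than forcing a copy.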
#if (ENABLE_SURFACE_CACHE_DEBUG)
u64 hash_block() const
{

View File

@ -362,6 +362,7 @@ namespace rsx
rsx::texture_upload_context context, rsx::texture_dimension_extended type, texture_create_flags flags) = 0;
virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, const address_range &rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format, texture_upload_context context,
const std::vector<rsx_subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) = 0;
virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, bool memory_load) = 0;
virtual void enforce_surface_creation_type(section_storage_type& section, u32 gcm_format, texture_create_flags expected) = 0;
virtual void insert_texture_barrier(commandbuffer_type&, image_storage_type* tex) = 0;
virtual image_view_type generate_cubemap_from_images(commandbuffer_type&, u32 gcm_format, u16 size, const std::vector<copy_region_descriptor>& sources, const texture_channel_remap_t& remap_vector) = 0;
@ -2429,6 +2430,7 @@ namespace rsx
// Check if src/dst are parts of render targets
typename surface_store_type::surface_overlap_info dst_subres;
bool use_null_region = false;
if (dst_address > 0xc0000000)
{
// TODO: HACK
@ -2442,6 +2444,7 @@ namespace rsx
// 1. Invalidate surfaces in range
// 2. Proceed as normal, blit into a 'normal' surface and any upload routines should catch it
m_rtts.invalidate_range(utils::address_range::start_length(dst_address, dst.pitch * dst_h));
use_null_region = (scale_x == 1.f && scale_y == 1.f);
}
// TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate
@ -2545,7 +2548,9 @@ namespace rsx
if (!dst_is_render_target)
{
// Check for any available region that will fit this one
auto overlapping_surfaces = find_texture_from_range(address_range::start_length(dst_address, dst.pitch * dst.clip_height), dst.pitch, rsx::texture_upload_context::blit_engine_dst);
const auto required_type = (use_null_region) ? texture_upload_context::dma : texture_upload_context::blit_engine_dst;
const auto dst_range = address_range::start_length(dst_address, dst.pitch * dst.clip_height);
auto overlapping_surfaces = find_texture_from_range(dst_range, dst.pitch, required_type);
for (const auto &surface : overlapping_surfaces)
{
if (!surface->is_locked())
@ -2561,6 +2566,17 @@ namespace rsx
continue;
}
if (use_null_region)
{
if (dst_range.inside(surface->get_section_range()))
{
// Attach to existing region
cached_dest = surface;
}
continue;
}
const auto this_address = surface->get_section_base();
if (this_address > dst_address)
{
@ -2609,9 +2625,9 @@ namespace rsx
// Check if available target is acceptable
// TODO: Check for other types of format mismatch
bool format_mismatch = false;
if (cached_dest)
if (cached_dest && !use_null_region)
{
bool format_mismatch = false;
if (cached_dest->is_depth_texture() != src_subres.is_depth)
{
// Dest surface has the wrong 'aspect'
@ -2635,14 +2651,14 @@ namespace rsx
break;
}
}
}
if (format_mismatch)
{
// The invalidate call before creating a new target will remove this section
cached_dest = nullptr;
dest_texture = 0;
dst_area = old_dst_area;
if (format_mismatch)
{
// The invalidate call before creating a new target will remove this section
cached_dest = nullptr;
dest_texture = 0;
dst_area = old_dst_area;
}
}
// Create source texture if it does not exist
@ -2795,7 +2811,7 @@ namespace rsx
else
gcm_format = (dst_is_argb8) ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5;
if (cached_dest)
if (cached_dest && !use_null_region)
{
// Prep surface
auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order :
@ -2847,9 +2863,9 @@ namespace rsx
const auto modified_range = utils::address_range::start_length(dst_address, mem_length);
if (dest_texture == 0)
if (!cached_dest && !dst_is_render_target)
{
verify(HERE), !dst_is_render_target;
verify(HERE), !dest_texture;
// Need to calculate the minimum required size that will fit the data, anchored on the rsx_address
// If the application starts off with an 'inset' section, the guessed dimensions may not fit!
@ -2859,55 +2875,72 @@ namespace rsx
const u32 section_length = std::max(write_end, expected_end) - dst.rsx_address;
dst_dimensions.height = section_length / dst.pitch;
// render target data is already in correct swizzle layout
auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order :
dst_is_argb8 ? rsx::texture_create_flags::default_component_order :
rsx::texture_create_flags::swapped_native_component_order;
// Translate dst_area into the 'full' dst block based on dst.rsx_address as (0, 0)
dst_area.x1 += dst.offset_x;
dst_area.x2 += dst.offset_x;
dst_area.y1 += dst.offset_y;
dst_area.y2 += dst.offset_y;
lock.upgrade();
// NOTE: Write flag set to remove all other overlapping regions (e.g. shader_read or blit_src)
const auto rsx_range = address_range::start_length(dst.rsx_address, section_length);
invalidate_range_impl_base(cmd, rsx_range, invalidation_cause::write, std::forward<Args>(extras)...);
if (!dst_area.x1 && !dst_area.y1 && dst_area.x2 == dst_dimensions.width && dst_area.y2 == dst_dimensions.height)
if (LIKELY(use_null_region))
{
cached_dest = create_new_texture(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, rsx::texture_dimension_extended::texture_dimension_2d,
channel_order);
bool force_dma_load = false;
if ((dst_w * dst_bpp) != dst.pitch)
{
// Keep Cell from touching the range we need
const auto prot_range = modified_range.to_page_range();
utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no);
force_dma_load = true;
}
cached_dest = create_nul_section(cmd, rsx_range, force_dma_load);
}
else
{
// HACK: workaround for data race with Cell
// Pre-lock the memory range we'll be touching, then load with super_ptr
const auto prot_range = modified_range.to_page_range();
utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no);
// render target data is already in correct swizzle layout
auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order :
dst_is_argb8 ? rsx::texture_create_flags::default_component_order :
rsx::texture_create_flags::swapped_native_component_order;
const u16 pitch_in_block = dst.pitch / dst_bpp;
std::vector<rsx_subresource_layout> subresource_layout;
rsx_subresource_layout subres = {};
subres.width_in_block = dst_dimensions.width;
subres.height_in_block = dst_dimensions.height;
subres.pitch_in_block = pitch_in_block;
subres.depth = 1;
subres.data = { reinterpret_cast<const gsl::byte*>(vm::get_super_ptr(dst.rsx_address)), dst.pitch * dst_dimensions.height };
subresource_layout.push_back(subres);
// Translate dst_area into the 'full' dst block based on dst.rsx_address as (0, 0)
dst_area.x1 += dst.offset_x;
dst_area.x2 += dst.offset_x;
dst_area.y1 += dst.offset_y;
dst_area.y2 += dst.offset_y;
cached_dest = upload_image_from_cpu(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, subresource_layout,
rsx::texture_dimension_extended::texture_dimension_2d, false);
if (!dst_area.x1 && !dst_area.y1 && dst_area.x2 == dst_dimensions.width && dst_area.y2 == dst_dimensions.height)
{
cached_dest = create_new_texture(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, rsx::texture_dimension_extended::texture_dimension_2d,
channel_order);
}
else
{
// HACK: workaround for data race with Cell
// Pre-lock the memory range we'll be touching, then load with super_ptr
const auto prot_range = modified_range.to_page_range();
utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no);
enforce_surface_creation_type(*cached_dest, gcm_format, channel_order);
const u16 pitch_in_block = dst.pitch / dst_bpp;
std::vector<rsx_subresource_layout> subresource_layout;
rsx_subresource_layout subres = {};
subres.width_in_block = dst_dimensions.width;
subres.height_in_block = dst_dimensions.height;
subres.pitch_in_block = pitch_in_block;
subres.depth = 1;
subres.data = { reinterpret_cast<const gsl::byte*>(vm::get_super_ptr(dst.rsx_address)), dst.pitch * dst_dimensions.height };
subresource_layout.push_back(subres);
cached_dest = upload_image_from_cpu(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, subresource_layout,
rsx::texture_dimension_extended::texture_dimension_2d, false);
enforce_surface_creation_type(*cached_dest, gcm_format, channel_order);
}
dest_texture = cached_dest->get_raw_texture();
typeless_info.dst_context = texture_upload_context::blit_engine_dst;
}
dest_texture = cached_dest->get_raw_texture();
typeless_info.dst_context = texture_upload_context::blit_engine_dst;
}
verify(HERE), cached_dest || dst_is_render_target;
@ -2979,8 +3012,15 @@ namespace rsx
dst_subres.surface->transform_blit_coordinates(rsx::surface_access::transfer, dst_area);
}
typeless_info.analyse();
blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info);
if (!use_null_region)
{
typeless_info.analyse();
blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info);
}
else
{
cached_dest->dma_transfer(cmd, vram_texture, src_area, modified_range, dst.pitch);
}
blit_op_result result = true;
result.is_depth = is_depth_blit;
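Condensed, the new destination handling amounts to roughly the following sketch (assembled from the hunks above; dst_is_local stands in for the dst_address > 0xc0000000 heuristic):

bool use_null_region = dst_is_local && scale_x == 1.f && scale_y == 1.f;
if (use_null_region)
{
    // No destination image is needed; attach to or create a DMA-only section...
    if (!cached_dest)
    {
        // Preserve partially covered rows by preloading guest memory when the write is not pitch-tight
        const bool force_dma_load = (dst_w * dst_bpp) != dst.pitch;
        cached_dest = create_nul_section(cmd, rsx_range, force_dma_load);
    }
    // ...and stream the source rows straight back to guest memory
    cached_dest->dma_transfer(cmd, vram_texture, src_area, modified_range, dst.pitch);
}
else
{
    typeless_info.analyse();
    blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info);
}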

View File

@ -1504,7 +1504,7 @@ namespace rsx
void add_flush_exclusion(const address_range& rng)
{
AUDIT(exists() && is_locked() && is_flushable());
AUDIT(is_locked() && is_flushable());
const auto _rng = rng.get_intersect(get_section_range());
flush_exclusions.merge(_rng);
}
@ -1710,7 +1710,14 @@ namespace rsx
bool exists() const
{
return derived()->exists();
if (derived()->exists())
{
return true;
}
else
{
return (context == rsx::texture_upload_context::dma && is_locked());
}
}
};
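The derived()/baseclass calls rely on the texture cache's usual CRTP layering; roughly (a sketch only, the actual template parameters differ):

template <typename Derived>
struct section_base
{
    Derived* derived() { return static_cast<Derived*>(this); }
    const Derived* derived() const { return static_cast<const Derived*>(this); }
};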

View File

@ -883,7 +883,7 @@ namespace gl
void data(GLsizeiptr size, const void* data_ = nullptr, GLenum usage = GL_STREAM_DRAW)
{
verify(HERE), m_memory_type == memory_type::undefined;
verify(HERE), m_memory_type != memory_type::local;
target target_ = current_target();
save_binding_state save(target_, *this);

View File

@ -61,72 +61,6 @@ namespace gl
texture::format format = texture::format::rgba;
texture::type type = texture::type::ubyte;
u8 get_pixel_size(texture::format fmt_, texture::type type_)
{
u8 size = 1;
switch (type_)
{
case texture::type::ubyte:
case texture::type::sbyte:
break;
case texture::type::ushort:
case texture::type::sshort:
case texture::type::f16:
size = 2;
break;
case texture::type::ushort_5_6_5:
case texture::type::ushort_5_6_5_rev:
case texture::type::ushort_4_4_4_4:
case texture::type::ushort_4_4_4_4_rev:
case texture::type::ushort_5_5_5_1:
case texture::type::ushort_1_5_5_5_rev:
return 2;
case texture::type::uint_8_8_8_8:
case texture::type::uint_8_8_8_8_rev:
case texture::type::uint_10_10_10_2:
case texture::type::uint_2_10_10_10_rev:
case texture::type::uint_24_8:
return 4;
case texture::type::f32:
case texture::type::sint:
case texture::type::uint:
size = 4;
break;
default:
LOG_ERROR(RSX, "Unsupported texture type");
}
switch (fmt_)
{
case texture::format::r:
break;
case texture::format::rg:
size *= 2;
break;
case texture::format::rgb:
case texture::format::bgr:
size *= 3;
break;
case texture::format::rgba:
case texture::format::bgra:
size *= 4;
break;
//Depth formats..
case texture::format::depth:
size = 2;
break;
case texture::format::depth_stencil:
size = 4;
break;
default:
LOG_ERROR(RSX, "Unsupported rtt format %d", (GLenum)fmt_);
size = 4;
}
return size;
}
void init_buffer(const gl::texture* src)
{
const u32 vram_size = src->pitch() * src->height();
@ -218,6 +152,61 @@ namespace gl
}
}
void dma_transfer(gl::command_context& cmd, gl::texture* src, const areai& /*src_area*/, const utils::address_range& /*valid_range*/, u32 pitch)
{
init_buffer(src);
glGetError();
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id);
if (context == rsx::texture_upload_context::dma)
{
// Determine unpack config dynamically
const auto format_info = gl::get_format_type(src->get_internal_format());
format = static_cast<gl::texture::format>(std::get<0>(format_info));
type = static_cast<gl::texture::type>(std::get<1>(format_info));
if ((src->aspect() & gl::image_aspect::stencil) == 0)
{
pack_unpack_swap_bytes = std::get<2>(format_info);
}
else
{
// Z24S8 decode is done on the CPU for now
pack_unpack_swap_bytes = false;
}
}
pixel_pack_settings pack_settings;
pack_settings.alignment(1);
pack_settings.swap_bytes(pack_unpack_swap_bytes);
src->copy_to(nullptr, format, type, pack_settings);
real_pitch = src->pitch();
rsx_pitch = pitch;
if (auto error = glGetError())
{
if (error == GL_OUT_OF_MEMORY && ::gl::get_driver_caps().vendor_AMD)
{
// AMD driver bug
// Pixel transfer fails with GL_OUT_OF_MEMORY. Usually happens with float textures or operations attempting to swap endianness.
// Failed operations also leak a large amount of memory
LOG_ERROR(RSX, "Memory transfer failure (AMD bug). Please update your driver to Adrenalin 19.4.3 or newer. Format=0x%x, Type=0x%x, Swap=%d", (u32)format, (u32)type, pack_unpack_swap_bytes);
}
else
{
LOG_ERROR(RSX, "Memory transfer failed with error 0x%x. Format=0x%x, Type=0x%x", error, (u32)format, (u32)type);
}
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
m_fence.reset();
synchronized = true;
sync_timestamp = get_system_time();
}
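Stripped of the cache bookkeeping, the readback above is the standard pixel-pack-buffer download; copy_to(nullptr, ...) effectively performs a glGetTexImage-style transfer into the bound PBO. A minimal sketch of the idiom (vram_size and swap_bytes stand in for the section's computed size and byte-swap flag):

GLuint pbo;
glGenBuffers(1, &pbo);
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
glBufferData(GL_PIXEL_PACK_BUFFER, vram_size, nullptr, GL_STREAM_READ);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
glPixelStorei(GL_PACK_SWAP_BYTES, swap_bytes ? GL_TRUE : GL_FALSE);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr); // writes at offset 0 of the bound PBO
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);        // signalled once the download completes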
void copy_texture(gl::command_context& cmd, bool miss)
{
ASSERT(exists());
@ -284,38 +273,7 @@ namespace gl
}
}
init_buffer(target_texture);
glGetError();
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id);
pixel_pack_settings pack_settings;
pack_settings.alignment(1);
pack_settings.swap_bytes(pack_unpack_swap_bytes);
target_texture->copy_to(nullptr, format, type, pack_settings);
real_pitch = target_texture->pitch();
if (auto error = glGetError())
{
if (error == GL_OUT_OF_MEMORY && ::gl::get_driver_caps().vendor_AMD)
{
// AMD driver bug
// Pixel transfer fails with GL_OUT_OF_MEMORY. Usually happens with float textures or operations attempting to swap endianness.
// Failed operations also leak a large amount of memory
LOG_ERROR(RSX, "Memory transfer failure (AMD bug). Please update your driver to Adrenalin 19.4.3 or newer. Format=0x%x, Type=0x%x, Swap=%d", (u32)format, (u32)type, pack_unpack_swap_bytes);
}
else
{
LOG_ERROR(RSX, "Memory transfer failed with error 0x%x. Format=0x%x, Type=0x%x", error, (u32)format, (u32)type);
}
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
m_fence.reset();
synchronized = true;
sync_timestamp = get_system_time();
dma_transfer(cmd, target_texture, {}, {}, rsx_pitch);
}
void fill_texture(gl::texture* tex)
@ -889,6 +847,21 @@ namespace gl
return &cached;
}
cached_texture_section* create_nul_section(gl::command_context& cmd, const utils::address_range& rsx_range, bool memory_load) override
{
auto& cached = *find_cached_texture(rsx_range, RSX_GCM_FORMAT_IGNORED, true, false);
ASSERT(!cached.is_locked());
// Prepare section
cached.reset(rsx_range);
cached.set_context(rsx::texture_upload_context::dma);
cached.set_dirty(false);
no_access_range = cached.get_min_max(no_access_range, rsx::section_bounds::locked_range);
update_cache_tag();
return &cached;
}
cached_texture_section* upload_image_from_cpu(gl::command_context &cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format,
rsx::texture_upload_context context, const std::vector<rsx_subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool input_swizzled) override
{

View File

@ -443,6 +443,7 @@ namespace vk
}
};
template<bool _SwapBytes = false>
struct cs_gather_d24x8 : cs_interleave_task
{
cs_gather_d24x8()
@ -456,13 +457,24 @@ namespace vk
" stencil_shift = (index % 4) * 8;\n"
" stencil = data[stencil_offset + s_offset];\n"
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
" value = (depth << 8) | stencil;\n"
" value = (depth << 8) | stencil;\n";
if constexpr (!_SwapBytes)
{
work_kernel +=
" data[index] = value;\n";
}
else
{
work_kernel +=
" data[index] = bswap_u32(value);\n";
}
cs_shuffle_base::build("");
}
};
template<bool _SwapBytes = false>
struct cs_gather_d32x8 : cs_interleave_task
{
cs_gather_d32x8()
@ -476,8 +488,18 @@ namespace vk
" stencil_shift = (index % 4) * 8;\n"
" stencil = data[stencil_offset + s_offset];\n"
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
" value = (depth << 8) | stencil;\n"
" value = (depth << 8) | stencil;\n";
if constexpr (!_SwapBytes)
{
work_kernel +=
" data[index] = value;\n";
}
else
{
work_kernel +=
" data[index] = bswap_u32(value);\n";
}
cs_shuffle_base::build("");
}
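For reference, the bswap_u32 emitted by the _SwapBytes variants is a plain 32-bit byte reversal (the GLSL helper itself presumably lives in the shared compute shader boilerplate); a CPU-side equivalent:

static inline u32 bswap_u32(u32 x)
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00u) | ((x << 8) & 0x00ff0000u) | (x << 24);
}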

View File

@ -2949,7 +2949,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
const u32 gcm_format = (m_depth_surface_info.depth_format == rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8;
m_texture_cache.lock_memory_region(
*m_current_command_buffer, m_rtts.m_bound_depth_stencil.second, surface_range, true,
m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, false);
m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, true);
}
else
{

View File

@ -148,7 +148,7 @@ namespace vk
void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout, const VkImageSubresourceRange& range);
void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout);
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region);
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, bool swap_bytes = false);
void copy_buffer_to_image(VkCommandBuffer cmd, const vk::buffer* src, const vk::image* dst, const VkBufferImageCopy& region);
void copy_image_typeless(const command_buffer &cmd, image *src, image *dst, const areai& src_rect, const areai& dst_rect,

View File

@ -56,7 +56,7 @@ namespace vk
}
}
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region)
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, bool swap_bytes)
{
// Always validate
verify("Invalid image layout!" HERE),
@ -66,6 +66,7 @@ namespace vk
{
default:
{
verify("Implicit byteswap option not supported for speficied format" HERE), !swap_bytes;
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dst->value, 1, &region);
break;
}
@ -83,8 +84,9 @@ namespace vk
const auto allocation_end = region.bufferOffset + packed_length + in_depth_size + in_stencil_size;
verify(HERE), dst->size() >= allocation_end;
const VkDeviceSize z_offset = align<VkDeviceSize>(region.bufferOffset + packed_length, 256);
const VkDeviceSize s_offset = align<VkDeviceSize>(z_offset + in_depth_size, 256);
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = align<u32>(data_offset + packed_length, 256);
const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
// 1. Copy the depth and stencil blocks to separate banks
VkBufferImageCopy sub_regions[2];
@ -97,20 +99,34 @@ namespace vk
// 2. Interleave the separated data blocks with a compute job
vk::cs_interleave_task *job;
if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT)
if (LIKELY(!swap_bytes))
{
job = vk::get_compute_task<vk::cs_gather_d24x8>();
if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT)
{
job = vk::get_compute_task<vk::cs_gather_d24x8<false>>();
}
else
{
job = vk::get_compute_task<vk::cs_gather_d32x8<false>>();
}
}
else
{
job = vk::get_compute_task<vk::cs_gather_d32x8>();
if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT)
{
job = vk::get_compute_task<vk::cs_gather_d24x8<true>>();
}
else
{
job = vk::get_compute_task<vk::cs_gather_d32x8<true>>();
}
}
vk::insert_buffer_memory_barrier(cmd, dst->value, z_offset, in_depth_size + in_stencil_size,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
job->run(cmd, dst, (u32)region.bufferOffset, packed_length, (u32)z_offset, (u32)s_offset);
job->run(cmd, dst, data_offset, packed_length, z_offset, s_offset);
vk::insert_buffer_memory_barrier(cmd, dst->value, region.bufferOffset, packed_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
@ -145,8 +161,9 @@ namespace vk
const auto allocation_end = region.bufferOffset + packed_length + in_depth_size + in_stencil_size;
verify("Out of memory (compute heap). Lower your resolution scale setting." HERE), src->size() >= allocation_end;
const VkDeviceSize z_offset = align<VkDeviceSize>(region.bufferOffset + packed_length, 256);
const VkDeviceSize s_offset = align<VkDeviceSize>(z_offset + in_depth_size, 256);
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = align<u32>(data_offset + packed_length, 256);
const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
// Zero out the stencil block
vkCmdFillBuffer(cmd, src->value, s_offset, in_stencil_size, 0);
@ -166,7 +183,7 @@ namespace vk
job = vk::get_compute_task<vk::cs_scatter_d32x8>();
}
job->run(cmd, src, (u32)region.bufferOffset, packed_length, (u32)z_offset, (u32)s_offset);
job->run(cmd, src, data_offset, packed_length, z_offset, s_offset);
vk::insert_buffer_memory_barrier(cmd, src->value, z_offset, in_depth_size + in_stencil_size,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,

View File

@ -151,8 +151,13 @@ namespace vk
VkFormat get_format() const
{
if (context == rsx::texture_upload_context::dma)
{
return VK_FORMAT_R32_UINT;
}
ASSERT(vram_texture != nullptr);
return vram_texture->info.format;
return vram_texture->format();
}
bool is_flushed() const
@ -161,18 +166,9 @@ namespace vk
return flushed;
}
void copy_texture(vk::command_buffer& cmd, bool miss)
void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
{
ASSERT(exists());
if (LIKELY(!miss))
{
baseclass::on_speculative_flush();
}
else
{
baseclass::on_miss();
}
verify(HERE), src->samples() == 1;
if (m_device == nullptr)
{
@ -186,9 +182,146 @@ namespace vk
vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence);
}
src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
const auto internal_bpp = vk::get_format_texel_width(src->format());
const auto transfer_width = (u32)src_area.width();
const auto transfer_height = (u32)src_area.height();
real_pitch = internal_bpp * transfer_width;
rsx_pitch = pitch;
const bool is_depth_stencil = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT);
if (is_depth_stencil || pack_unpack_swap_bytes)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = real_pitch;
const auto task_length = transfer_pitch * src_area.height();
auto working_buffer = vk::get_scratch_buffer();
auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
VkBufferImageCopy region = {};
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
vk::copy_image_to_buffer(cmd, src, working_buffer, region, (is_depth_stencil && pack_unpack_swap_bytes));
// NOTE: For depth-stencil formats, copying to buffer and byteswap are combined into one step above
if (pack_unpack_swap_bytes && !is_depth_stencil)
{
const auto texel_layout = vk::get_format_element_size(src->format());
const auto elem_size = texel_layout.first;
vk::cs_shuffle_base *shuffle_kernel;
if (elem_size == 2)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
}
else if (elem_size == 4)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
}
else
{
fmt::throw_exception("Unreachable" HERE);
}
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
}
if (LIKELY(rsx_pitch == real_pitch))
{
VkBufferCopy copy = {};
copy.dstOffset = final_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
}
else
{
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
u32 dst_offset = final_mapping.first;
u32 src_offset = 0;
for (unsigned row = 0; row < transfer_height; ++row)
{
copy.push_back({ src_offset, dst_offset, transfer_pitch });
src_offset += real_pitch;
dst_offset += rsx_pitch;
}
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
}
}
else
{
VkBufferImageCopy region = {};
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
region.bufferOffset = mapping.first;
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
}
src->pop_layout(cmd);
if (UNLIKELY(synchronized))
{
// Replace the wait event with a new one to avoid premature signaling!
vk::get_resource_manager()->dispose(dma_fence);
VkEventCreateInfo createInfo = {};
createInfo.sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO;
vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence);
}
else
{
// If this is speculated, it should only occur once
verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET;
}
cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT);
synchronized = true;
sync_timestamp = get_system_time();
}
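Completion of the write-back is tracked by the dma_fence VkEvent set at the end of the recorded commands; on flush, the cache can wait on it along these lines (a sketch, assuming the command buffer has been submitted and device is the owning VkDevice):

while (vkGetEventStatus(device, dma_fence) == VK_EVENT_RESET)
{
    std::this_thread::yield();
}
// The mapped DMA buffer now holds the written-back rows.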
void copy_texture(vk::command_buffer& cmd, bool miss)
{
ASSERT(exists());
if (LIKELY(!miss))
{
verify(HERE), !synchronized;
baseclass::on_speculative_flush();
}
else
{
baseclass::on_miss();
}
if (m_device == nullptr)
{
m_device = &cmd.get_command_pool().get_owner();
}
vk::image *locked_resource = vram_texture;
u32 transfer_width = width;
u32 transfer_height = height;
u32 transfer_x = 0, transfer_y = 0;
if (context == rsx::texture_upload_context::framebuffer_storage)
{
@ -199,12 +332,7 @@ namespace vk
transfer_height *= surface->samples_y;
}
verify(HERE), locked_resource->samples() == 1;
vk::image* target = locked_resource;
locked_resource->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
real_pitch = vk::get_format_texel_width(locked_resource->info.format) * locked_resource->width();
if (transfer_width != locked_resource->width() || transfer_height != locked_resource->height())
{
// TODO: Synchronize access to typeless textures
@ -221,14 +349,9 @@ namespace vk
target->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
}
verify(HERE), target->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
// TODO: Read back stencil values (is this really necessary?)
const auto internal_bpp = vk::get_format_texel_width(vram_texture->format());
const auto valid_range = get_confirmed_range();
real_pitch = internal_bpp * transfer_width;
u32 transfer_x = 0, transfer_y = 0;
if (const auto section_range = get_section_range(); section_range != valid_range)
{
if (const auto offset = (valid_range.start - get_section_base()))
@ -250,111 +373,12 @@ namespace vk
}
}
if ((vram_texture->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) ||
pack_unpack_swap_bytes)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = transfer_width * internal_bpp;
const auto task_length = transfer_pitch * transfer_height;
auto working_buffer = vk::get_scratch_buffer();
auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
VkBufferImageCopy region = {};
region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 };
region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
vk::copy_image_to_buffer(cmd, target, working_buffer, region);
const auto texel_layout = vk::get_format_element_size(vram_texture->format());
const auto elem_size = texel_layout.first;
vk::cs_shuffle_base *shuffle_kernel;
if (elem_size == 2)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
}
else if (elem_size == 4)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
}
else
{
fmt::throw_exception("Unreachable" HERE);
}
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
if (LIKELY(rsx_pitch == real_pitch))
{
VkBufferCopy copy = {};
copy.dstOffset = final_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
}
else
{
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
u32 dst_offset = final_mapping.first;
u32 src_offset = 0;
for (unsigned row = 0; row < transfer_height; ++row)
{
copy.push_back({src_offset, dst_offset, transfer_pitch});
src_offset += real_pitch;
dst_offset += rsx_pitch;
}
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
}
}
else
{
VkBufferImageCopy region = {};
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 };
region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
region.bufferOffset = mapping.first;
vkCmdCopyImageToBuffer(cmd, target->value, target->current_layout, mapping.second->value, 1, &region);
}
locked_resource->pop_layout(cmd);
if (UNLIKELY(synchronized))
{
verify(HERE), miss;
// Replace the wait event with a new one to avoid premature signaling!
vk::get_resource_manager()->dispose(dma_fence);
VkEventCreateInfo createInfo = {};
createInfo.sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO;
vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence);
}
else
{
// If this is speculated, it should only occur once
verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET;
}
cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT);
synchronized = true;
sync_timestamp = get_system_time();
areai src_area;
src_area.x1 = (s32)transfer_x;
src_area.y1 = (s32)transfer_y;
src_area.x2 = s32(transfer_x + transfer_width);
src_area.y2 = s32(transfer_y + transfer_height);
dma_transfer(cmd, target, src_area, valid_range, rsx_pitch);
}
/**
@ -1079,24 +1103,51 @@ namespace vk
region.create(width, height, section_depth, mipmaps, image, pitch, true, gcm_format);
region.set_dirty(false);
//Its not necessary to lock blit dst textures as they are just reused as necessary
if (context != rsx::texture_upload_context::blit_engine_dst)
// It's not necessary to lock blit dst textures, as they are just reused as necessary
switch (context)
{
case rsx::texture_upload_context::shader_read:
case rsx::texture_upload_context::blit_engine_src:
region.protect(utils::protection::ro);
read_only_range = region.get_min_max(read_only_range, rsx::section_bounds::locked_range);
}
else
{
//TODO: Confirm byte swap patterns
//NOTE: Protection is handled by the caller
region.set_unpack_swap_bytes((aspect_flags & VK_IMAGE_ASPECT_COLOR_BIT) == VK_IMAGE_ASPECT_COLOR_BIT);
break;
case rsx::texture_upload_context::blit_engine_dst:
region.set_unpack_swap_bytes(true);
no_access_range = region.get_min_max(no_access_range, rsx::section_bounds::locked_range);
break;
case rsx::texture_upload_context::dma:
case rsx::texture_upload_context::framebuffer_storage:
// Should not be initialized with this method
default:
fmt::throw_exception("Unexpected upload context 0x%x", u32(context));
}
update_cache_tag();
return &region;
}
cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, bool memory_load) override
{
auto& region = *find_cached_texture(rsx_range, RSX_GCM_FORMAT_IGNORED, true, false);
ASSERT(!region.is_locked());
// Prepare section
region.reset(rsx_range);
region.set_context(rsx::texture_upload_context::dma);
region.set_dirty(false);
region.set_unpack_swap_bytes(true);
if (memory_load)
{
vk::map_dma(cmd, rsx_range.start, rsx_range.length());
vk::load_dma(rsx_range.start, rsx_range.length());
}
no_access_range = region.get_min_max(no_access_range, rsx::section_bounds::locked_range);
update_cache_tag();
return &region;
}
cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format,
rsx::texture_upload_context context, const std::vector<rsx_subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) override
{