diff --git a/src/xenia/gpu/sampler_info.cc b/src/xenia/gpu/sampler_info.cc
index 0adefbd1b..c6fcf0985 100644
--- a/src/xenia/gpu/sampler_info.cc
+++ b/src/xenia/gpu/sampler_info.cc
@@ -44,6 +44,8 @@ bool SamplerInfo::Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
   out_info->border_color = static_cast<BorderColor>(fetch.border_color);
   out_info->lod_bias = (fetch.lod_bias) / 32.f;
+  out_info->mip_min_level = fetch.mip_min_level;
+  out_info->mip_max_level = fetch.mip_max_level;
 
   return true;
 }
diff --git a/src/xenia/gpu/sampler_info.h b/src/xenia/gpu/sampler_info.h
index 57e3ceeac..415c28bd2 100644
--- a/src/xenia/gpu/sampler_info.h
+++ b/src/xenia/gpu/sampler_info.h
@@ -26,6 +26,8 @@ struct SamplerInfo {
   AnisoFilter aniso_filter;
   BorderColor border_color;
   float lod_bias;
+  uint32_t mip_min_level;
+  uint32_t mip_max_level;
 
   static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
                       const ParsedTextureFetchInstruction& fetch_instr,
@@ -36,7 +38,9 @@ struct SamplerInfo {
     return min_filter == other.min_filter && mag_filter == other.mag_filter &&
            mip_filter == other.mip_filter && clamp_u == other.clamp_u &&
            clamp_v == other.clamp_v && clamp_w == other.clamp_w &&
-           aniso_filter == other.aniso_filter;
+           aniso_filter == other.aniso_filter && lod_bias == other.lod_bias &&
+           mip_min_level == other.mip_min_level &&
+           mip_max_level == other.mip_max_level;
   }
 };
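Note: the `/ 32.f` in `Prepare` implies the fetch constant's LOD bias is fixed-point with five fractional bits. A minimal sketch of that conversion, assuming the raw field is already sign-extended (the helper name is illustrative, not part of the change):

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical helper: converts a sign-extended fixed-point LOD bias
// (five fractional bits, so 1.0f == 32) to float, mirroring the
// lod_bias computation in SamplerInfo::Prepare above.
float LodBiasToFloat(int32_t raw_bias) { return raw_bias / 32.f; }

int main() {
  std::printf("%f\n", LodBiasToFloat(-16));  // -0.5 LOD
  std::printf("%f\n", LodBiasToFloat(48));   // +1.5 LOD
  return 0;
}
```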
diff --git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc
index 655249304..36883f8a4 100644
--- a/src/xenia/gpu/texture_info.cc
+++ b/src/xenia/gpu/texture_info.cc
@@ -15,6 +15,7 @@
 
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
+#include "xenia/base/memory.h"
 
 #include "third_party/xxhash/xxhash.h"
 
@@ -59,6 +60,8 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
   info.endianness = static_cast<Endian>(fetch.endianness);
   info.is_tiled = fetch.tiled;
   info.has_packed_mips = fetch.packed_mips;
+  info.mip_address = fetch.mip_address << 12;
+  info.mip_levels = fetch.packed_mips ? fetch.mip_max_level + 1 : 1;
   info.input_length = 0;  // Populated below.
 
   if (info.format_info()->format == TextureFormat::kUnknown) {
@@ -78,6 +81,7 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
     } break;
     case Dimension::k3D: {
       // TODO(benvanik): calculate size.
+      assert_always();
       return false;
     }
     case Dimension::kCube: {
@@ -106,6 +110,8 @@ bool TextureInfo::PrepareResolve(uint32_t physical_address,
   info.endianness = endian;
   info.is_tiled = true;
   info.has_packed_mips = false;
+  info.mip_address = 0;
+  info.mip_levels = 1;
   info.input_length = 0;
 
   if (info.format_info()->format == TextureFormat::kUnknown) {
@@ -121,10 +127,6 @@ void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) {
   size_2d.logical_width = width;
   size_2d.logical_height = height;
 
-  // Here be dragons. The values here are used in texture_cache.cc to copy
-  // images and create GL textures. Changes here will impact that code.
-  // TODO(benvanik): generic texture copying utility.
-
   auto format = format_info();
 
   // w/h in blocks.
@@ -135,11 +137,15 @@ void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) {
       xe::round_up(size_2d.logical_height, format->block_height) /
       format->block_height;
 
-  // Tiles are 32x32 blocks. The pitch of all textures must a multiple of tile
-  // dimensions.
-  uint32_t tile_width = xe::round_up(block_width, 32) / 32;
-  size_2d.block_width = tile_width * 32;
-  size_2d.block_height = block_height;
+  if (is_tiled) {
+    // If the texture is tiled, its dimensions must be a multiple of tile
+    // dimensions (32x32 blocks).
+    size_2d.block_width = xe::round_up(block_width, 32);
+    size_2d.block_height = xe::round_up(block_height, 32);
+  } else {
+    size_2d.block_width = block_width;
+    size_2d.block_height = block_height;
+  }
 
   uint32_t bytes_per_block =
       format->block_width * format->block_height * format->bits_per_pixel / 8;
@@ -177,11 +183,15 @@ void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height,
       xe::round_up(size_cube.logical_height, format->block_height) /
       format->block_height;
 
-  // Tiles are 32x32 blocks. All textures must be multiples of tile dimensions.
-  uint32_t tile_width = xe::round_up(block_width, 32) / 32;
-  uint32_t tile_height = xe::round_up(block_height, 32) / 32;
-  size_cube.block_width = tile_width * 32;
-  size_cube.block_height = tile_height * 32;
+  if (is_tiled) {
+    // If the texture is tiled, its dimensions must be a multiple of tile
+    // dimensions (32x32 blocks).
+    size_cube.block_width = xe::round_up(block_width, 32);
+    size_cube.block_height = xe::round_up(block_height, 32);
+  } else {
+    size_cube.block_width = block_width;
+    size_cube.block_height = block_height;
+  }
 
   uint32_t bytes_per_block =
       format->block_width * format->block_height * format->bits_per_pixel / 8;
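To make the new tiled/linear split concrete, here is a worked sketch of the rounding for a hypothetical 260x130 DXT-style texture with 4x4-texel blocks (numbers are illustrative; a local `round_up` stands in for `xe::round_up`):

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for xe::round_up.
uint32_t round_up(uint32_t value, uint32_t multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

int main() {
  // 260x130 texels with 4x4-texel blocks -> 65x33 blocks.
  uint32_t block_width = round_up(260, 4) / 4;
  uint32_t block_height = round_up(130, 4) / 4;
  // Tiled: pad both dimensions to whole 32x32-block tiles.
  std::printf("tiled: %ux%u blocks\n", round_up(block_width, 32),
              round_up(block_height, 32));  // 96x64
  // Linear: keep the block dimensions as-is.
  std::printf("linear: %ux%u blocks\n", block_width, block_height);  // 65x33
  return 0;
}
```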
@@ -204,12 +214,154 @@ void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height,
   input_length = size_cube.input_face_length * 6;
 }
 
+static void TextureSwap(Endian endianness, void* dest, const void* src,
+                        size_t length) {
+  switch (endianness) {
+    case Endian::k8in16:
+      xe::copy_and_swap_16_unaligned(dest, src, length / 2);
+      break;
+    case Endian::k8in32:
+      xe::copy_and_swap_32_unaligned(dest, src, length / 4);
+      break;
+    case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
+      xe::copy_and_swap_16_in_32_unaligned(dest, src, length);
+      break;
+    default:
+    case Endian::kUnspecified:
+      std::memcpy(dest, src, length);
+      break;
+  }
+}
+
+static void ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
+                             const uint8_t* src, Endian src_endianness) {
+  // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
+  union {
+    uint8_t data[8];
+    struct {
+      uint8_t r0, g0, r1, g1;
+      uint32_t xx;
+    };
+  } block;
+  static_assert(sizeof(block) == 8, "CTX1 block mismatch");
+
+  const uint32_t bytes_per_block = 8;
+  TextureSwap(src_endianness, block.data, src, bytes_per_block);
+
+  uint8_t cr[4] = {
+      block.r0, block.r1,
+      static_cast<uint8_t>(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1),
+      static_cast<uint8_t>(1.f / 3.f * block.r0 + 2.f / 3.f * block.r1)};
+  uint8_t cg[4] = {
+      block.g0, block.g1,
+      static_cast<uint8_t>(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1),
+      static_cast<uint8_t>(1.f / 3.f * block.g0 + 2.f / 3.f * block.g1)};
+
+  for (uint32_t oy = 0; oy < 4; ++oy) {
+    for (uint32_t ox = 0; ox < 4; ++ox) {
+      uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3;
+      dest[(oy * dest_pitch) + (ox * 2) + 0] = cr[xx];
+      dest[(oy * dest_pitch) + (ox * 2) + 1] = cg[xx];
+    }
+  }
+}
+
+void TextureInfo::ConvertTiled(uint8_t* dest, const uint8_t* src, Endian endian,
+                               const FormatInfo* format_info, uint32_t offset_x,
+                               uint32_t offset_y, uint32_t block_pitch,
+                               uint32_t width, uint32_t height,
+                               uint32_t output_width) {
+  // TODO(benvanik): optimize this inner loop (or work by tiles).
+  uint32_t bytes_per_block = format_info->block_width *
+                             format_info->block_height *
+                             format_info->bits_per_pixel / 8;
+
+  uint32_t output_pitch = output_width * format_info->block_width *
+                          format_info->bits_per_pixel / 8;
+
+  uint32_t output_row_height = 1;
+  if (format_info->format == TextureFormat::k_CTX1) {
+    // TODO: Can we calculate this?
+    output_row_height = 4;
+  }
+
+  // Logical w/h in blocks.
+  uint32_t block_width =
+      xe::round_up(width, format_info->block_width) / format_info->block_width;
+  uint32_t block_height = xe::round_up(height, format_info->block_height) /
+                          format_info->block_height;
+
+  // Log2 of the block size, in bytes.
+  auto log2_bpp =
+      (bytes_per_block / 4) + ((bytes_per_block / 2) >> (bytes_per_block / 4));
+
+  // Offset to the current row, in bytes.
+  uint32_t output_row_offset = 0;
+  for (uint32_t y = 0; y < block_height; y++) {
+    auto input_row_offset =
+        TextureInfo::TiledOffset2DOuter(offset_y + y, block_pitch, log2_bpp);
+
+    // Go block-by-block on this row.
+    uint32_t output_offset = output_row_offset;
+    for (uint32_t x = 0; x < block_width; x++) {
+      auto input_offset = TextureInfo::TiledOffset2DInner(
+          offset_x + x, offset_y + y, log2_bpp, input_row_offset);
+      input_offset >>= log2_bpp;
+
+      if (format_info->format == TextureFormat::k_CTX1) {
+        // Convert to R8G8.
+        ConvertTexelCTX1(&dest[output_offset], output_pitch,
+                         src + input_offset * bytes_per_block, endian);
+      } else {
+        // Generic swap to destination.
+        TextureSwap(endian, dest + output_offset,
+                    src + input_offset * bytes_per_block, bytes_per_block);
+      }
+
+      output_offset += bytes_per_block;
+    }
+
+    output_row_offset += output_pitch * output_row_height;
+  }
+}
+
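The `log2_bpp` expression in `ConvertTiled` above looks opaque, but for the power-of-two block sizes that occur here (1, 2, 4, 8, and 16 bytes) it evaluates to the base-2 logarithm without a loop or intrinsic. A quick standalone verification sketch:

```cpp
#include <cassert>
#include <cstdint>

// Same expression as in ConvertTiled: maps 1/2/4/8/16 -> 0/1/2/3/4.
uint32_t Log2Bpp(uint32_t bytes_per_block) {
  return (bytes_per_block / 4) +
         ((bytes_per_block / 2) >> (bytes_per_block / 4));
}

int main() {
  assert(Log2Bpp(1) == 0);
  assert(Log2Bpp(2) == 1);
  assert(Log2Bpp(4) == 2);
  assert(Log2Bpp(8) == 3);
  assert(Log2Bpp(16) == 4);
  return 0;
}
```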
 uint32_t TextureInfo::GetMaxMipLevels(uint32_t width, uint32_t height,
                                       uint32_t depth) {
   return 1 + xe::log2_floor(std::max({width, height, depth}));
 }
 
-bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
+uint32_t TextureInfo::GetMipLocation(const TextureInfo& src, uint32_t mip,
+                                     uint32_t* offset_x, uint32_t* offset_y) {
+  if (mip == 0) {
+    // Short-circuit. Mip 0 is always stored in guest_address.
+    GetPackedTileOffset(src, offset_x, offset_y);
+    return src.guest_address;
+  }
+
+  // Walk forward to find the address of the mip.
+  // If the texture is <= 16 pixels w/h, the mips are packed with the base
+  // texture. Otherwise, they're stored beginning from mip_address.
+  uint32_t address_base = std::min(src.width, src.height) < 16
+                              ? src.guest_address
+                              : src.mip_address;
+  uint32_t address_offset = 0;
+
+  for (uint32_t i = 1; i < mip; i++) {
+    uint32_t logical_width = std::max((src.width + 1) >> i, 1u);
+    uint32_t logical_height = std::max((src.height + 1) >> i, 1u);
+    if (std::min(logical_width, logical_height) <= 16) {
+      // We've reached the point where the mips are packed into a single tile.
+      // TODO(DrChat): Figure out how to calculate the packed tile offset.
+      continue;
+    }
+
+    address_offset += src.input_length >> (i * 2);
+  }
+
+  return address_base + address_offset;
+}
+
+bool TextureInfo::GetPackedTileOffset(uint32_t width, uint32_t height,
+                                      const FormatInfo* format_info,
                                       uint32_t* out_offset_x,
                                       uint32_t* out_offset_y) {
   // Tile size is 32x32, and once textures go <=16 they are packed into a
@@ -231,6 +383,13 @@ bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
   // This only works for square textures, or textures that are some non-pot
   // <= square. As soon as the aspect ratio goes weird, the textures start to
   // stretch across tiles.
+  //
+  // The 2x2 and 1x1 squares are packed in their specific positions because
+  // each square is the size of at least one block (which is 4x4 pixels max)
+  // 4x4: x = 4
+  // 2x2: y = (x & 0x3) << 2
+  // 1x1: y = (x & 0x3) << 2
+  //
   // if (tile_aligned(w) > tile_aligned(h)) {
   //   // wider than tall, so packed horizontally
   // } else if (tile_aligned(w) < tile_aligned(h)) {
@@ -243,16 +402,14 @@
 
   // The minimum dimension is what matters most: if either width or height
   // is <= 16 this mode kicks in.
-  if (std::min(texture_info.size_2d.logical_width,
-               texture_info.size_2d.logical_height) > 16) {
+  if (std::min(width, height) > 16) {
     // Too big, not packed.
     *out_offset_x = 0;
     *out_offset_y = 0;
     return false;
   }
 
-  if (xe::log2_ceil(texture_info.size_2d.logical_width) >
-      xe::log2_ceil(texture_info.size_2d.logical_height)) {
+  if (xe::log2_ceil(width) > xe::log2_ceil(height)) {
     // Wider than tall. Laid out vertically.
     *out_offset_x = 0;
     *out_offset_y = 16;
@@ -261,26 +418,37 @@
     *out_offset_x = 16;
     *out_offset_y = 0;
   }
-  *out_offset_x /= texture_info.format_info()->block_width;
-  *out_offset_y /= texture_info.format_info()->block_height;
+
+  *out_offset_x /= format_info->block_width;
+  *out_offset_y /= format_info->block_height;
   return true;
 }
 
+bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
+                                      uint32_t* out_offset_x,
+                                      uint32_t* out_offset_y) {
+  return GetPackedTileOffset(
+      texture_info.size_2d.logical_width, texture_info.size_2d.logical_height,
+      texture_info.format_info(), out_offset_x, out_offset_y);
+}
+
 // https://github.com/BinomialLLC/crunch/blob/ea9b8d8c00c8329791256adafa8cf11e4e7942a2/inc/crn_decomp.h#L4108
 uint32_t TextureInfo::TiledOffset2DOuter(uint32_t y, uint32_t width,
-                                         uint32_t log_bpp) {
-  uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7);
-  uint32_t micro = ((y & 6) << 2) << log_bpp;
-  return macro + ((micro & ~15) << 1) + (micro & 15) +
-         ((y & 8) << (3 + log_bpp)) + ((y & 1) << 4);
+                                         uint32_t log2_bpp) {
+  uint32_t macro = ((y / 32) * (width / 32)) << (log2_bpp + 7);
+  uint32_t micro = ((y & 6) << 2) << log2_bpp;
+  return macro + ((micro & ~0xF) << 1) + (micro & 0xF) +
+         ((y & 8) << (3 + log2_bpp)) + ((y & 1) << 4);
 }
 
-uint32_t TextureInfo::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
+uint32_t TextureInfo::TiledOffset2DInner(uint32_t x, uint32_t y,
+                                         uint32_t log2_bpp,
                                          uint32_t base_offset) {
-  uint32_t macro = (x >> 5) << (bpp + 7);
-  uint32_t micro = (x & 7) << bpp;
-  uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15));
-  return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) +
+  uint32_t macro = (x / 32) << (log2_bpp + 7);
+  uint32_t micro = (x & 7) << log2_bpp;
+  uint32_t offset =
+      base_offset + (macro + ((micro & ~0xF) << 1) + (micro & 0xF));
+  return ((offset & ~0x1FF) << 3) + ((offset & 0x1C0) << 2) + (offset & 0x3F) +
          ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6);
 }
 
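The address walk in `GetMipLocation` relies on each mip level occupying a quarter of the previous level, hence the `input_length >> (i * 2)` term. A worked sketch with an illustrative 0x40000-byte base level:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t input_length = 0x40000;  // hypothetical size of mip 0, in bytes
  uint32_t mip = 4;
  uint32_t address_offset = 0;
  for (uint32_t i = 1; i < mip; i++) {
    // Level i is a quarter the size of level i - 1.
    address_offset += input_length >> (i * 2);  // 0x10000, 0x4000, 0x1000
  }
  std::printf("mip %u starts at mip_address + 0x%X\n", mip,
              address_offset);  // mip 4 starts at mip_address + 0x15000
  return 0;
}
```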
diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h
index 821d5134b..0f02a236a 100644
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@@ -256,6 +256,8 @@ struct TextureInfo {
   Endian endianness;
   bool is_tiled;
   bool has_packed_mips;
+  uint32_t mip_address;
+  uint32_t mip_levels;
   uint32_t input_length;
 
   const FormatInfo* format_info() const {
@@ -304,14 +306,26 @@ struct TextureInfo {
                              uint32_t width, uint32_t height,
                              TextureInfo* out_info);
 
+  static void ConvertTiled(uint8_t* dest, const uint8_t* src, Endian endian,
+                           const FormatInfo* format_info, uint32_t offset_x,
+                           uint32_t offset_y, uint32_t block_pitch,
+                           uint32_t width, uint32_t height,
+                           uint32_t output_width);
+
   static uint32_t GetMaxMipLevels(uint32_t width, uint32_t height,
                                   uint32_t depth);
+  static uint32_t GetMipLocation(const TextureInfo& src, uint32_t mip,
+                                 uint32_t* offset_x, uint32_t* offset_y);
+  static bool GetPackedTileOffset(uint32_t width, uint32_t height,
+                                  const FormatInfo* format_info,
+                                  uint32_t* out_offset_x,
+                                  uint32_t* out_offset_y);
   static bool GetPackedTileOffset(const TextureInfo& texture_info,
                                   uint32_t* out_offset_x,
                                   uint32_t* out_offset_y);
 
   static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width,
-                                     uint32_t log_bpp);
-  static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
+                                     uint32_t log2_bpp);
+  static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t log2_bpp,
                                      uint32_t base_offset);
 
   uint64_t hash() const;
 
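For reference, the two tiled-offset helpers declared above compose the way `ConvertTiled` uses them: the outer call resolves the row, the inner call resolves the block within that row. A sketch of that composition, under the assumptions that coordinates and pitch are in blocks and that the `>> log2_bpp` scaling matches `ConvertTiled` (the `xe::gpu` namespace and header path are taken from the surrounding code):

```cpp
#include <cstdint>
// Assumes the declarations above, e.g. #include "xenia/gpu/texture_info.h".
using xe::gpu::TextureInfo;

// Hypothetical helper: byte offset of block (x, y) in a tiled texture.
uint32_t TiledBlockByteOffset(uint32_t x, uint32_t y, uint32_t block_pitch,
                              uint32_t log2_bpp, uint32_t bytes_per_block) {
  // Row component first, then the block within the row.
  uint32_t row = TextureInfo::TiledOffset2DOuter(y, block_pitch, log2_bpp);
  uint32_t addr = TextureInfo::TiledOffset2DInner(x, y, log2_bpp, row);
  // ConvertTiled shifts the result down to a block index before scaling.
  return (addr >> log2_bpp) * bytes_per_block;
}
```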
diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc
index 9d433f6f1..3ea4afead 100644
--- a/src/xenia/gpu/vulkan/texture_cache.cc
+++ b/src/xenia/gpu/vulkan/texture_cache.cc
@@ -258,7 +258,13 @@ TextureCache::Texture* TextureCache::AllocateTexture(
   assert_not_null(texture_info.format_info());
   auto& config = texture_configs[int(texture_info.format_info()->format)];
   VkFormat format = config.host_format;
-  assert(format != VK_FORMAT_UNDEFINED);
+  if (format == VK_FORMAT_UNDEFINED) {
+    XELOGE(
+        "Texture Cache: Attempted to allocate texture format %s, which is "
+        "defined as VK_FORMAT_UNDEFINED!",
+        texture_info.format_info()->name);
+    return nullptr;
+  }
 
   image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
   image_info.usage =
@@ -302,7 +308,7 @@ TextureCache::Texture* TextureCache::AllocateTexture(
   image_info.format = format;
   image_info.extent = {texture_info.width + 1, texture_info.height + 1, 1};
-  image_info.mipLevels = 1;
+  image_info.mipLevels = texture_info.mip_levels;
   image_info.arrayLayers = texture_info.depth + 1;
   image_info.samples = VK_SAMPLE_COUNT_1_BIT;
   image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
@@ -664,8 +670,6 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) {
   sampler_create_info.addressModeW =
       address_mode_map[static_cast<int>(sampler_info.clamp_w)];
 
-  sampler_create_info.mipLodBias = sampler_info.lod_bias;
-
   float aniso = 0.f;
   switch (sampler_info.aniso_filter) {
     case AnisoFilter::kDisabled:
@@ -697,8 +701,9 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) {
   sampler_create_info.compareEnable = VK_FALSE;
   sampler_create_info.compareOp = VK_COMPARE_OP_NEVER;
-  sampler_create_info.minLod = 0.0f;
-  sampler_create_info.maxLod = 0.0f;
+  sampler_create_info.mipLodBias = sampler_info.lod_bias;
+  sampler_create_info.minLod = float(sampler_info.mip_min_level);
+  sampler_create_info.maxLod = float(sampler_info.mip_max_level);
   sampler_create_info.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
   sampler_create_info.unnormalizedCoordinates = VK_FALSE;
   VkSampler vk_sampler;
@@ -816,13 +821,13 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
                  size_t length) {
   switch (endianness) {
     case Endian::k8in16:
-      xe::copy_and_swap_16_aligned(dest, src, length / 2);
+      xe::copy_and_swap_16_unaligned(dest, src, length / 2);
       break;
     case Endian::k8in32:
-      xe::copy_and_swap_32_aligned(dest, src, length / 4);
+      xe::copy_and_swap_32_unaligned(dest, src, length / 4);
       break;
     case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
-      xe::copy_and_swap_16_in_32_aligned(dest, src, length);
+      xe::copy_and_swap_16_in_32_unaligned(dest, src, length);
       break;
     default:
     case Endian::kUnspecified:
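As context for the k16in32 case touched above, a minimal standalone sketch of what "swap high and low 16 bits within a 32 bit word" does to a single word (this is not the `xe::` implementation, just the transform on one value):

```cpp
#include <cassert>
#include <cstdint>

// One word of the k16in32 transform: halfword swap within a 32-bit word.
uint32_t Swap16In32(uint32_t v) { return (v >> 16) | (v << 16); }

int main() {
  assert(Swap16In32(0xAABBCCDDu) == 0xCCDDAABBu);
  return 0;
}
```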
@@ -867,43 +872,21 @@ void TextureCache::FlushPendingCommands(VkCommandBuffer command_buffer,
   vkBeginCommandBuffer(command_buffer, &begin_info);
 }
 
-void TextureCache::ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
-                                    const uint8_t* src, Endian src_endianness) {
-  // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
-  union {
-    uint8_t data[8];
-    struct {
-      uint8_t r0, g0, r1, g1;
-      uint32_t xx;
-    };
-  } block;
-  static_assert(sizeof(block) == 8, "CTX1 block mismatch");
-
-  const uint32_t bytes_per_block = 8;
-  TextureSwap(src_endianness, block.data, src, bytes_per_block);
-
-  uint8_t cr[4] = {
-      block.r0, block.r1,
-      static_cast<uint8_t>(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1),
-      static_cast<uint8_t>(1.f / 3.f * block.r0 + 2.f / 3.f * block.r1)};
-  uint8_t cg[4] = {
-      block.g0, block.g1,
-      static_cast<uint8_t>(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1),
-      static_cast<uint8_t>(1.f / 3.f * block.g0 + 2.f / 3.f * block.g1)};
-
-  for (uint32_t oy = 0; oy < 4; ++oy) {
-    for (uint32_t ox = 0; ox < 4; ++ox) {
-      uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3;
-      dest[(oy * dest_pitch) + (ox * 2) + 0] = cr[xx];
-      dest[(oy * dest_pitch) + (ox * 2) + 1] = cg[xx];
-    }
-  }
-}
-
 bool TextureCache::ConvertTexture2D(uint8_t* dest,
                                     VkBufferImageCopy* copy_region,
-                                    const TextureInfo& src) {
-  void* host_address = memory_->TranslatePhysical(src.guest_address);
+                                    uint32_t mip, const TextureInfo& src) {
+  uint32_t offset_x = 0;
+  uint32_t offset_y = 0;
+  uint32_t address =
+      TextureInfo::GetMipLocation(src, mip, &offset_x, &offset_y);
+  void* host_address = memory_->TranslatePhysical(address);
+
+  uint32_t logical_width = src.size_2d.logical_width >> mip;
+  uint32_t logical_height = src.size_2d.logical_height >> mip;
+  uint32_t block_width = src.size_2d.block_width >> mip;
+  uint32_t input_width = src.size_2d.input_width >> mip;
+  uint32_t input_height = src.size_2d.input_height >> mip;
+
   if (!src.is_tiled) {
     uint32_t offset_x, offset_y;
     if (src.has_packed_mips &&
@@ -922,89 +905,25 @@
         src_mem += src.size_2d.input_pitch;
         dest += src.size_2d.input_pitch;
       }
-      copy_region->bufferRowLength = src.size_2d.input_width;
-      copy_region->bufferImageHeight = src.size_2d.input_height;
-      copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
-      copy_region->imageExtent = {src.size_2d.logical_width,
-                                  src.size_2d.logical_height, 1};
-      return true;
     } else {
       // Fast path copy entire image.
       TextureSwap(src.endianness, dest, host_address, src.input_length);
-      copy_region->bufferRowLength = src.size_2d.input_width;
-      copy_region->bufferImageHeight = src.size_2d.input_height;
-      copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
-      copy_region->imageExtent = {src.size_2d.logical_width,
-                                  src.size_2d.logical_height, 1};
-      return true;
     }
   } else {
     // Untile image.
     // We could do this in a shader to speed things up, as this is pretty
     // slow.
-
-    // TODO(benvanik): optimize this inner loop (or work by tiles).
     const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-    uint32_t bytes_per_block = src.format_info()->block_width *
-                               src.format_info()->block_height *
-                               src.format_info()->bits_per_pixel / 8;
-
-    uint32_t output_pitch = src.size_2d.input_width *
-                            src.format_info()->block_width *
-                            src.format_info()->bits_per_pixel / 8;
-
-    uint32_t output_row_height = 1;
-    if (src.texture_format == TextureFormat::k_CTX1) {
-      // TODO: Can we calculate this?
-      output_row_height = 4;
-    }
-
-    // Tiled textures can be packed; get the offset into the packed texture.
-    uint32_t offset_x;
-    uint32_t offset_y;
-    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
-    auto log2_bpp = (bytes_per_block >> 2) +
-                    ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
-
-    // Offset to the current row, in bytes.
-    uint32_t output_row_offset = 0;
-    for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
-      auto input_row_offset = TextureInfo::TiledOffset2DOuter(
-          offset_y + y, src.size_2d.block_width, log2_bpp);
-
-      // Go block-by-block on this row.
-      uint32_t output_offset = output_row_offset;
-      for (uint32_t x = 0; x < src.size_2d.block_width; x++) {
-        auto input_offset = TextureInfo::TiledOffset2DInner(
-            offset_x + x, offset_y + y, log2_bpp, input_row_offset);
-        input_offset >>= log2_bpp;
-
-        if (src.texture_format == TextureFormat::k_CTX1) {
-          // Convert to R8G8.
-          ConvertTexelCTX1(&dest[output_offset], output_pitch, src_mem,
-                           src.endianness);
-        } else {
-          // Generic swap to destination.
-          TextureSwap(src.endianness, dest + output_offset,
-                      src_mem + input_offset * bytes_per_block,
-                      bytes_per_block);
-        }
-
-        output_offset += bytes_per_block;
-      }
-
-      output_row_offset += output_pitch * output_row_height;
-    }
-
-    copy_region->bufferRowLength = src.size_2d.input_width;
-    copy_region->bufferImageHeight = src.size_2d.input_height;
-    copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
-    copy_region->imageExtent = {src.size_2d.logical_width,
-                                src.size_2d.logical_height, 1};
-    return true;
+    TextureInfo::ConvertTiled(dest, src_mem, src.endianness, src.format_info(),
+                              offset_x, offset_y, block_width, logical_width,
+                              logical_height, input_width);
   }
 
-  return false;
+  copy_region->bufferRowLength = input_width;
+  copy_region->bufferImageHeight = input_height;
+  copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, mip, 0, 1};
+  copy_region->imageExtent = {logical_width, logical_height, 1};
+  return true;
 }
 
 bool TextureCache::ConvertTextureCube(uint8_t* dest,
@@ -1067,13 +986,13 @@ bool TextureCache::ConvertTextureCube(uint8_t* dest,
 }
 
 bool TextureCache::ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
-                                  const TextureInfo& src) {
+                                  uint32_t mip, const TextureInfo& src) {
   switch (src.dimension) {
     case Dimension::k1D:
       assert_always();
       break;
     case Dimension::k2D:
-      return ConvertTexture2D(dest, copy_region, src);
+      return ConvertTexture2D(dest, copy_region, mip, src);
     case Dimension::k3D:
       assert_always();
       break;
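The per-mip copy regions produced by `ConvertTexture2D` above shift every size by the mip index and tag the subresource with it. A sketch of the resulting region for mip 2 of a hypothetical 256x128 linear texture (values are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <vulkan/vulkan.h>

int main() {
  uint32_t mip = 2;
  uint32_t input_width = 256 >> mip;   // 64
  uint32_t input_height = 128 >> mip;  // 32
  VkBufferImageCopy region = {};
  region.bufferRowLength = input_width;
  region.bufferImageHeight = input_height;
  // {aspectMask, mipLevel, baseArrayLayer, layerCount}
  region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, mip, 0, 1};
  region.imageExtent = {input_width, input_height, 1};
  std::printf("mip %u: %ux%u\n", mip, region.imageExtent.width,
              region.imageExtent.height);
  return 0;
}
```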
@@ -1083,6 +1002,145 @@ bool TextureCache::ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
   return false;
 }
 
+bool TextureCache::UploadTexture(VkCommandBuffer command_buffer,
+                                 VkFence completion_fence, Texture* dest,
+                                 const TextureInfo& src) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  size_t unpack_length;
+  if (!ComputeTextureStorage(&unpack_length, src)) {
+    XELOGW("Failed to compute texture storage");
+    return false;
+  }
+
+  size_t total_unpack_length = unpack_length;
+  for (uint32_t i = 1; i < src.mip_levels; i++) {
+    // Add in more space for mips.
+    total_unpack_length += unpack_length >> (2 * i);
+  }
+
+  if (!staging_buffer_.CanAcquire(total_unpack_length)) {
+    // Need to have unique memory for every upload for at least one frame. If
+    // we run out of memory, we need to flush all queued upload commands to
+    // the GPU.
+    FlushPendingCommands(command_buffer, completion_fence);
+
+    // Uploads have been flushed. Continue.
+    if (!staging_buffer_.CanAcquire(total_unpack_length)) {
+      // The staging buffer isn't big enough to hold this texture.
+      XELOGE(
+          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
+          total_unpack_length);
+      assert_always();
+      return false;
+    }
+  }
+
+  // Grab some temporary memory for staging.
+  auto alloc = staging_buffer_.Acquire(total_unpack_length, completion_fence);
+  assert_not_null(alloc);
+  if (!alloc) {
+    XELOGE("%s: Failed to acquire staging memory", __func__);
+    return false;
+  }
+
+  // DEBUG: Check the source address. If it's completely zero'd out, print it.
+  bool valid = false;
+  auto src_data = memory_->TranslatePhysical(src.guest_address);
+  for (uint32_t i = 0; i < src.input_length; i++) {
+    if (src_data[i] != 0) {
+      valid = true;
+      break;
+    }
+  }
+
+  if (!valid) {
+    XELOGW(
+        "Warning: Uploading blank texture at address 0x%.8X "
+        "(length: 0x%.8X, format: %s)",
+        src.guest_address, src.input_length, src.format_info()->name);
+  }
+
+  // Upload texture into GPU memory.
+  // TODO: If the GPU supports it, we can submit a compute batch to convert the
+  // texture and copy it to its destination. Otherwise, fallback to conversion
+  // on the CPU.
+  std::vector<VkBufferImageCopy> copy_regions(src.mip_levels);
+
+  // Base MIP.
+  if (!ConvertTexture(reinterpret_cast<uint8_t*>(alloc->host_ptr),
+                      &copy_regions[0], 0, src)) {
+    XELOGW("Failed to convert texture");
+    return false;
+  }
+  copy_regions[0].bufferOffset = alloc->offset;
+  copy_regions[0].imageOffset = {0, 0, 0};
+
+  // Now upload all the MIPs.
+  VkDeviceSize buffer_offset = unpack_length;
+  for (uint32_t mip = 1; mip < src.mip_levels; mip++) {
+    uint8_t* dest = reinterpret_cast<uint8_t*>(alloc->host_ptr) + buffer_offset;
+    ConvertTexture(dest, &copy_regions[mip], mip, src);
+    copy_regions[mip].bufferOffset = alloc->offset + buffer_offset;
+    copy_regions[mip].imageOffset = {0, 0, 0};
+
+    // With each mip, the length is divided by 4.
+    buffer_offset += unpack_length >> (2 * mip);
+  }
+
+  // Transition the texture into a transfer destination layout.
+  VkImageMemoryBarrier barrier;
+  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  barrier.pNext = nullptr;
+  barrier.srcAccessMask = 0;
+  barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+  barrier.oldLayout = dest->image_layout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.image = dest->image;
+  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, src.mip_levels,
+                              copy_regions[0].imageSubresource.baseArrayLayer,
+                              copy_regions[0].imageSubresource.layerCount};
+  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+    barrier.subresourceRange.aspectMask =
+        VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+  }
+
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  // Now move the converted texture into the destination.
+  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+    // Do just a depth upload (for now).
+    // This assumes depth buffers don't have mips (hopefully they don't).
+    copy_regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
+  }
+  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
+                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                         src.mip_levels, copy_regions.data());
+
+  // Now transition the texture into a shader readonly source.
+  barrier.srcAccessMask = barrier.dstAccessMask;
+  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  barrier.oldLayout = barrier.newLayout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                       VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
+                           VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+                       0, 0, nullptr, 0, nullptr, 1, &barrier);
+
+  dest->image_layout = barrier.newLayout;
+  return true;
+}
+
 bool TextureCache::ComputeTextureStorage(size_t* output_length,
                                          const TextureInfo& src) {
   if (src.texture_format == TextureFormat::k_CTX1) {
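The staging-size loop at the top of `UploadTexture` adds `unpack_length >> (2 * i)` per level, so a full chain never needs more than 4/3 of the base level. A worked sketch with an illustrative 1 MiB base level:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  size_t unpack_length = 0x100000;  // hypothetical base level, 1 MiB
  uint32_t mip_levels = 5;
  size_t total = unpack_length;
  for (uint32_t i = 1; i < mip_levels; i++) {
    // Same accumulation as UploadTexture: each level is a quarter the size.
    total += unpack_length >> (2 * i);
  }
  std::printf("total staging: 0x%zX bytes\n", total);  // 0x155000
  return 0;
}
```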
@@ -1182,120 +1240,6 @@ void TextureCache::WritebackTexture(Texture* texture) {
   wb_staging_buffer_.Scavenge();
 }
 
-bool TextureCache::UploadTexture(VkCommandBuffer command_buffer,
-                                 VkFence completion_fence, Texture* dest,
-                                 const TextureInfo& src) {
-#if FINE_GRAINED_DRAW_SCOPES
-  SCOPE_profile_cpu_f("gpu");
-#endif  // FINE_GRAINED_DRAW_SCOPES
-
-  size_t unpack_length;
-  if (!ComputeTextureStorage(&unpack_length, src)) {
-    XELOGW("Failed to compute texture storage");
-    return false;
-  }
-
-  if (!staging_buffer_.CanAcquire(unpack_length)) {
-    // Need to have unique memory for every upload for at least one frame. If
-    // we run out of memory, we need to flush all queued upload commands to
-    // the GPU.
-    FlushPendingCommands(command_buffer, completion_fence);
-
-    // Uploads have been flushed. Continue.
-    if (!staging_buffer_.CanAcquire(unpack_length)) {
-      // The staging buffer isn't big enough to hold this texture.
-      XELOGE(
-          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
-          unpack_length);
-      assert_always();
-      return false;
-    }
-  }
-
-  // Grab some temporary memory for staging.
-  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
-  assert_not_null(alloc);
-
-  // DEBUG: Check the source address. If it's completely zero'd out, print it.
-  bool valid = false;
-  auto src_data = memory_->TranslatePhysical(src.guest_address);
-  for (uint32_t i = 0; i < src.input_length; i++) {
-    if (src_data[i] != 0) {
-      valid = true;
-      break;
-    }
-  }
-
-  if (!valid) {
-    XELOGW(
-        "Warning: Uploading blank texture at address 0x%.8X "
-        "(length: 0x%.8X, format: %d)",
-        src.guest_address, src.input_length, src.texture_format);
-  }
-
-  // Upload texture into GPU memory.
-  // TODO: If the GPU supports it, we can submit a compute batch to convert the
-  // texture and copy it to its destination. Otherwise, fallback to conversion
-  // on the CPU.
-  VkBufferImageCopy copy_region;
-  if (!ConvertTexture(reinterpret_cast<uint8_t*>(alloc->host_ptr),
-                      &copy_region, src)) {
-    XELOGW("Failed to convert texture");
-    return false;
-  }
-
-  // Transition the texture into a transfer destination layout.
-  VkImageMemoryBarrier barrier;
-  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
-  barrier.pNext = nullptr;
-  barrier.srcAccessMask = 0;
-  barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-  barrier.oldLayout = dest->image_layout;
-  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-  barrier.image = dest->image;
-  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1,
-                              copy_region.imageSubresource.baseArrayLayer,
-                              copy_region.imageSubresource.layerCount};
-  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-    barrier.subresourceRange.aspectMask =
-        VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
-  }
-
-  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-                       VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0,
-                       nullptr, 1, &barrier);
-
-  // Now move the converted texture into the destination.
-  copy_region.bufferOffset = alloc->offset;
-  copy_region.imageOffset = {0, 0, 0};
-  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-    // Do just a depth upload (for now).
-    copy_region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
-  }
-  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
-                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
-                         &copy_region);
-
-  // Now transition the texture into a shader readonly source.
-  barrier.srcAccessMask = barrier.dstAccessMask;
-  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-  barrier.oldLayout = barrier.newLayout;
-  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
-  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT,
-                       VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                           VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
-                       0, 0, nullptr, 0, nullptr, 1, &barrier);
-
-  dest->image_layout = barrier.newLayout;
-  return true;
-}
-
 void TextureCache::HashTextureBindings(
     XXH64_state_t* hash_state, uint32_t& fetch_mask,
     const std::vector<Shader::TextureBinding>& bindings) {
diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h
index 6be4a6660..ab529333d 100644
--- a/src/xenia/gpu/vulkan/texture_cache.h
+++ b/src/xenia/gpu/vulkan/texture_cache.h
@@ -149,15 +149,12 @@ class TextureCache {
   void FlushPendingCommands(VkCommandBuffer command_buffer,
                             VkFence completion_fence);
 
-  static void ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
-                               const uint8_t* src, Endian src_endianness);
-
   bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region,
-                        const TextureInfo& src);
-  bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region,
+                        uint32_t mip, const TextureInfo& src);
+  bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_regions,
                           const TextureInfo& src);
   bool ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
-                      const TextureInfo& src);
+                      uint32_t mip, const TextureInfo& src);
   bool ComputeTextureStorage(size_t* output_length, const TextureInfo& src);
 
   // Writes a texture back into guest memory. This call is (mostly) asynchronous
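Finally, a small cross-check relating the new `mip_levels` field (`fetch.mip_max_level + 1` when mips are present) to `GetMaxMipLevels` from texture_info.cc. A local `log2_floor` stands in for `xe::log2_floor`; values are illustrative:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for xe::log2_floor.
uint32_t log2_floor(uint32_t v) {
  uint32_t r = 0;
  while (v >>= 1) ++r;
  return r;
}

// Same formula as TextureInfo::GetMaxMipLevels above.
uint32_t GetMaxMipLevels(uint32_t width, uint32_t height, uint32_t depth) {
  return 1 + log2_floor(std::max({width, height, depth}));
}

int main() {
  uint32_t mip_max_level = 8;               // from a hypothetical fetch constant
  uint32_t mip_levels = mip_max_level + 1;  // as in TextureInfo::Prepare
  std::printf("max for 256x128: %u, fetched: %u\n",
              GetMaxMipLevels(256, 128, 1), mip_levels);  // 9, 9
  return 0;
}
```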