[Vulkan] Initial support for mipmaps

commit 1f157f35f4
parent f7c7cc54ed
src/xenia/gpu/sampler_info.cc

@@ -44,6 +44,8 @@ bool SamplerInfo::Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
   out_info->border_color = static_cast<BorderColor>(fetch.border_color);
   out_info->lod_bias = (fetch.lod_bias) / 32.f;
+  out_info->mip_min_level = fetch.mip_min_level;
+  out_info->mip_max_level = fetch.mip_max_level;
 
   return true;
 }
 
src/xenia/gpu/sampler_info.h

@@ -26,6 +26,8 @@ struct SamplerInfo {
   AnisoFilter aniso_filter;
   BorderColor border_color;
   float lod_bias;
+  uint32_t mip_min_level;
+  uint32_t mip_max_level;
 
   static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
                       const ParsedTextureFetchInstruction& fetch_instr,
@@ -36,7 +38,9 @@ struct SamplerInfo {
     return min_filter == other.min_filter && mag_filter == other.mag_filter &&
            mip_filter == other.mip_filter && clamp_u == other.clamp_u &&
            clamp_v == other.clamp_v && clamp_w == other.clamp_w &&
-           aniso_filter == other.aniso_filter;
+           aniso_filter == other.aniso_filter && lod_bias == other.lod_bias &&
+           mip_min_level == other.mip_min_level &&
+           mip_max_level == other.mip_max_level;
   }
 };
 
src/xenia/gpu/texture_info.cc

@@ -15,6 +15,7 @@
 
 #include "xenia/base/logging.h"
+#include "xenia/base/math.h"
 #include "xenia/base/memory.h"
 
 #include "third_party/xxhash/xxhash.h"
 
@@ -59,6 +60,8 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
   info.endianness = static_cast<Endian>(fetch.endianness);
   info.is_tiled = fetch.tiled;
   info.has_packed_mips = fetch.packed_mips;
+  info.mip_address = fetch.mip_address << 12;
+  info.mip_levels = fetch.packed_mips ? fetch.mip_max_level + 1 : 1;
   info.input_length = 0;  // Populated below.
 
   if (info.format_info()->format == TextureFormat::kUnknown) {
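Note: fetch.mip_address is stored in 4 KiB units (hence the << 12), and
mip_max_level is an inclusive level index (hence the + 1). A minimal sketch of
the decode, with hypothetical standalone names:

    #include <cstdint>
    // Sketch only; mirrors the two added lines above.
    uint32_t MipAddressBytes(uint32_t fetch_mip_address) {
      return fetch_mip_address << 12;  // 4 KiB-aligned guest address
    }
    uint32_t MipLevelCount(bool packed_mips, uint32_t mip_max_level) {
      return packed_mips ? mip_max_level + 1 : 1;  // e.g. max level 8 -> 9 levels
    }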
@@ -78,6 +81,7 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
     } break;
     case Dimension::k3D: {
       // TODO(benvanik): calculate size.
       assert_always();
+      return false;
     }
     case Dimension::kCube: {
@@ -106,6 +110,8 @@ bool TextureInfo::PrepareResolve(uint32_t physical_address,
   info.endianness = endian;
   info.is_tiled = true;
   info.has_packed_mips = false;
+  info.mip_address = 0;
+  info.mip_levels = 1;
   info.input_length = 0;
 
   if (info.format_info()->format == TextureFormat::kUnknown) {
@@ -121,10 +127,6 @@ void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) {
   size_2d.logical_width = width;
   size_2d.logical_height = height;
 
-  // Here be dragons. The values here are used in texture_cache.cc to copy
-  // images and create GL textures. Changes here will impact that code.
-  // TODO(benvanik): generic texture copying utility.
-
   auto format = format_info();
 
   // w/h in blocks.
@@ -135,11 +137,15 @@ void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) {
       xe::round_up(size_2d.logical_height, format->block_height) /
       format->block_height;
 
-  // Tiles are 32x32 blocks. The pitch of all textures must a multiple of tile
-  // dimensions.
-  uint32_t tile_width = xe::round_up(block_width, 32) / 32;
-  size_2d.block_width = tile_width * 32;
-  size_2d.block_height = block_height;
+  if (is_tiled) {
+    // If the texture is tiled, its dimensions must be a multiple of tile
+    // dimensions (32x32 blocks).
+    size_2d.block_width = xe::round_up(block_width, 32);
+    size_2d.block_height = xe::round_up(block_height, 32);
+  } else {
+    size_2d.block_width = block_width;
+    size_2d.block_height = block_height;
+  }
 
   uint32_t bytes_per_block =
       format->block_width * format->block_height * format->bits_per_pixel / 8;
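Worked example of the sizing above, assuming xe::round_up rounds to the next
multiple: a 130x100 texture with 4x4 blocks spans 33x25 blocks; the tiled path
pads that to whole 32x32-block tiles (64x32), while the linear path now keeps
the exact 33x25.

    #include <cstdint>
    constexpr uint32_t round_up(uint32_t value, uint32_t multiple) {
      return ((value + multiple - 1) / multiple) * multiple;  // stand-in for xe::round_up
    }
    static_assert(round_up(130, 4) / 4 == 33, "width in blocks");
    static_assert(round_up(100, 4) / 4 == 25, "height in blocks");
    static_assert(round_up(33, 32) == 64, "tiled: padded to 32-block tiles");
    static_assert(round_up(25, 32) == 32, "tiled: padded to 32-block tiles");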
@@ -177,11 +183,15 @@ void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height,
       xe::round_up(size_cube.logical_height, format->block_height) /
       format->block_height;
 
-  // Tiles are 32x32 blocks. All textures must be multiples of tile dimensions.
-  uint32_t tile_width = xe::round_up(block_width, 32) / 32;
-  uint32_t tile_height = xe::round_up(block_height, 32) / 32;
-  size_cube.block_width = tile_width * 32;
-  size_cube.block_height = tile_height * 32;
+  if (is_tiled) {
+    // If the texture is tiled, its dimensions must be a multiple of tile
+    // dimensions (32x32 blocks).
+    size_cube.block_width = xe::round_up(block_width, 32);
+    size_cube.block_height = xe::round_up(block_height, 32);
+  } else {
+    size_cube.block_width = block_width;
+    size_cube.block_height = block_height;
+  }
 
   uint32_t bytes_per_block =
       format->block_width * format->block_height * format->bits_per_pixel / 8;
@@ -204,12 +214,154 @@ void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height,
   input_length = size_cube.input_face_length * 6;
 }
 
+static void TextureSwap(Endian endianness, void* dest, const void* src,
+                        size_t length) {
+  switch (endianness) {
+    case Endian::k8in16:
+      xe::copy_and_swap_16_unaligned(dest, src, length / 2);
+      break;
+    case Endian::k8in32:
+      xe::copy_and_swap_32_unaligned(dest, src, length / 4);
+      break;
+    case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
+      xe::copy_and_swap_16_in_32_unaligned(dest, src, length);
+      break;
+    default:
+    case Endian::kUnspecified:
+      std::memcpy(dest, src, length);
+      break;
+  }
+}
+
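For bytes b0 b1 b2 b3 in memory, k8in16 produces b1 b0 b3 b2, k8in32 produces
b3 b2 b1 b0, and k16in32 swaps the two halfwords of each 32-bit word, producing
b2 b3 b0 b1. A standalone illustration of the k16in32 case (not the xe::
implementation itself):

    #include <cstdint>
    // Swap the high and low 16 bits of one 32-bit word.
    constexpr uint32_t Swap16In32(uint32_t v) { return (v >> 16) | (v << 16); }
    static_assert(Swap16In32(0xAABBCCDDu) == 0xCCDDAABBu, "halfword swap");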
+static void ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
+                             const uint8_t* src, Endian src_endianness) {
+  // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
+  union {
+    uint8_t data[8];
+    struct {
+      uint8_t r0, g0, r1, g1;
+      uint32_t xx;
+    };
+  } block;
+  static_assert(sizeof(block) == 8, "CTX1 block mismatch");
+
+  const uint32_t bytes_per_block = 8;
+  TextureSwap(src_endianness, block.data, src, bytes_per_block);
+
+  uint8_t cr[4] = {
+      block.r0, block.r1,
+      static_cast<uint8_t>(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1),
+      static_cast<uint8_t>(1.f / 3.f * block.r0 + 2.f / 3.f * block.r1)};
+  uint8_t cg[4] = {
+      block.g0, block.g1,
+      static_cast<uint8_t>(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1),
+      static_cast<uint8_t>(1.f / 3.f * block.g0 + 2.f / 3.f * block.g1)};
+
+  for (uint32_t oy = 0; oy < 4; ++oy) {
+    for (uint32_t ox = 0; ox < 4; ++ox) {
+      uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3;
+      dest[(oy * dest_pitch) + (ox * 2) + 0] = cr[xx];
+      dest[(oy * dest_pitch) + (ox * 2) + 1] = cg[xx];
+    }
+  }
+}
+
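CTX1 (per the linked Xenos slides) is a two-channel block format: each 8-byte
block holds two R8G8 endpoints plus sixteen 2-bit indices for a 4x4 texel
block, and the decoder above expands it to plain R8G8 using BC1-style 2/3-1/3
interpolation. Illustration of the palette for one channel:

    #include <cstdint>
    // Sketch: endpoints 0 and 255 yield the palette {0, 255, 85, 170}.
    void BuildChannelPalette(uint8_t e0, uint8_t e1, uint8_t out[4]) {
      out[0] = e0;
      out[1] = e1;
      out[2] = static_cast<uint8_t>(2.f / 3.f * e0 + 1.f / 3.f * e1);
      out[3] = static_cast<uint8_t>(1.f / 3.f * e0 + 2.f / 3.f * e1);
    }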
+void TextureInfo::ConvertTiled(uint8_t* dest, const uint8_t* src, Endian endian,
+                               const FormatInfo* format_info, uint32_t offset_x,
+                               uint32_t offset_y, uint32_t block_pitch,
+                               uint32_t width, uint32_t height,
+                               uint32_t output_width) {
+  // TODO(benvanik): optimize this inner loop (or work by tiles).
+  uint32_t bytes_per_block = format_info->block_width *
+                             format_info->block_height *
+                             format_info->bits_per_pixel / 8;
+
+  uint32_t output_pitch =
+      output_width * format_info->block_width * format_info->bits_per_pixel / 8;
+
+  uint32_t output_row_height = 1;
+  if (format_info->format == TextureFormat::k_CTX1) {
+    // TODO: Can we calculate this?
+    output_row_height = 4;
+  }
+
+  // logical w/h in blocks.
+  uint32_t block_width =
+      xe::round_up(width, format_info->block_width) / format_info->block_width;
+  uint32_t block_height = xe::round_up(height, format_info->block_height) /
+                          format_info->block_height;
+
+  // log2 of bytes per block.
+  auto log2_bpp =
+      (bytes_per_block / 4) + ((bytes_per_block / 2) >> (bytes_per_block / 4));
+
+  // Offset to the current row, in bytes.
+  uint32_t output_row_offset = 0;
+  for (uint32_t y = 0; y < block_height; y++) {
+    auto input_row_offset =
+        TextureInfo::TiledOffset2DOuter(offset_y + y, block_pitch, log2_bpp);
+
+    // Go block-by-block on this row.
+    uint32_t output_offset = output_row_offset;
+    for (uint32_t x = 0; x < block_width; x++) {
+      auto input_offset = TextureInfo::TiledOffset2DInner(
+          offset_x + x, offset_y + y, log2_bpp, input_row_offset);
+      input_offset >>= log2_bpp;
+
+      if (format_info->format == TextureFormat::k_CTX1) {
+        // Convert to R8G8.
+        ConvertTexelCTX1(&dest[output_offset], output_pitch, src, endian);
+      } else {
+        // Generic swap to destination.
+        TextureSwap(endian, dest + output_offset,
+                    src + input_offset * bytes_per_block, bytes_per_block);
+      }
+
+      output_offset += bytes_per_block;
+    }
+
+    output_row_offset += output_pitch * output_row_height;
+  }
+}
+
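The log2_bpp expression is a branchless log2 that holds for the block sizes
occurring here; for bytes_per_block in {1, 2, 4, 8, 16} it yields
{0, 1, 2, 3, 4}. A standalone check:

    #include <cstdint>
    constexpr uint32_t Log2Bpp(uint32_t bpb) {
      return (bpb / 4) + ((bpb / 2) >> (bpb / 4));
    }
    static_assert(Log2Bpp(1) == 0 && Log2Bpp(2) == 1 && Log2Bpp(4) == 2 &&
                      Log2Bpp(8) == 3 && Log2Bpp(16) == 4,
                  "log2 of power-of-two block sizes");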
+uint32_t TextureInfo::GetMaxMipLevels(uint32_t width, uint32_t height,
+                                      uint32_t depth) {
+  return 1 + xe::log2_floor(std::max({width, height, depth}));
+}
+
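This is the standard full-chain count: a 256x128 texture has
1 + floor(log2(256)) = 9 levels (256x128 down through 1x1). Assuming
xe::log2_floor computes floor(log2(x)):

    #include <algorithm>
    #include <cstdint>
    constexpr uint32_t log2_floor(uint32_t v) {  // local stand-in
      return v <= 1 ? 0 : 1 + log2_floor(v >> 1);
    }
    static_assert(1 + log2_floor(std::max({256u, 128u, 1u})) == 9,
                  "256x128 -> 9 mip levels");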
-bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
+uint32_t TextureInfo::GetMipLocation(const TextureInfo& src, uint32_t mip,
+                                     uint32_t* offset_x, uint32_t* offset_y) {
+  if (mip == 0) {
+    // Short-circuit. Mip 0 is always stored in guest_address.
+    GetPackedTileOffset(src, offset_x, offset_y);
+    return src.guest_address;
+  }
+
+  // Walk forward to find the address of the mip.
+  // If the texture is <= 16 pixels w/h, the mips are packed with the base
+  // texture. Otherwise, they're stored beginning from mip_address.
+  uint32_t address_base = std::min(src.width, src.height) < 16
+                              ? src.guest_address
+                              : src.mip_address;
+  uint32_t address_offset = 0;
+
+  for (uint32_t i = 1; i < mip; i++) {
+    uint32_t logical_width = std::max((src.width + 1) >> mip, 1u);
+    uint32_t logical_height = std::max((src.height + 1) >> mip, 1u);
+    if (std::min(logical_width, logical_height) <= 16) {
+      // We've reached the point where the mips are packed into a single tile.
+      // TODO(DrChat): Figure out how to calculate the packed tile offset.
+      continue;
+    }
+
+    address_offset += src.input_length >> (i * 2);
+  }
+
+  return address_base + address_offset;
+}
+
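The walk assumes each mip occupies a quarter of the previous level
(input_length >> (i * 2)) and skips levels that have collapsed into the packed
tail. Worked example, ignoring the packed-mip early-out: with
input_length = 0x40000, mip 3 lives at address_base + 0x10000 + 0x4000.

    #include <cstdint>
    // Sketch of the accumulation in the loop above.
    uint32_t MipOffsetFromBase(uint32_t input_length, uint32_t mip) {
      uint32_t offset = 0;
      for (uint32_t i = 1; i < mip; i++) {
        offset += input_length >> (i * 2);  // each level is 1/4 the previous
      }
      return offset;  // MipOffsetFromBase(0x40000, 3) == 0x14000
    }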
+bool TextureInfo::GetPackedTileOffset(uint32_t width, uint32_t height,
+                                      const FormatInfo* format_info,
+                                      uint32_t* out_offset_x,
+                                      uint32_t* out_offset_y) {
   // Tile size is 32x32, and once textures go <=16 they are packed into a
@@ -231,6 +383,13 @@ bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
   // This only works for square textures, or textures that are some non-pot
   // <= square. As soon as the aspect ratio goes weird, the textures start to
   // stretch across tiles.
+  //
+  // The 2x2 and 1x1 squares are packed in their specific positions because
+  // each square is the size of at least one block (which is 4x4 pixels max)
+  // 4x4: x = 4
+  // 2x2: y = (x & 0x3) << 2
+  // 1x1: y = (x & 0x3) << 2
+  //
   // if (tile_aligned(w) > tile_aligned(h)) {
   //   // wider than tall, so packed horizontally
   // } else if (tile_aligned(w) < tile_aligned(h)) {
@@ -243,16 +402,14 @@ bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
   // The minimum dimension is what matters most: if either width or height
   // is <= 16 this mode kicks in.
 
-  if (std::min(texture_info.size_2d.logical_width,
-               texture_info.size_2d.logical_height) > 16) {
+  if (std::min(width, height) > 16) {
     // Too big, not packed.
     *out_offset_x = 0;
     *out_offset_y = 0;
     return false;
   }
 
-  if (xe::log2_ceil(texture_info.size_2d.logical_width) >
-      xe::log2_ceil(texture_info.size_2d.logical_height)) {
+  if (xe::log2_ceil(width) > xe::log2_ceil(height)) {
     // Wider than tall. Laid out vertically.
     *out_offset_x = 0;
     *out_offset_y = 16;
@@ -261,26 +418,37 @@ bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
     *out_offset_x = 16;
     *out_offset_y = 0;
   }
-  *out_offset_x /= texture_info.format_info()->block_width;
-  *out_offset_y /= texture_info.format_info()->block_height;
+
+  *out_offset_x /= format_info->block_width;
+  *out_offset_y /= format_info->block_height;
   return true;
 }
 
+bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
+                                      uint32_t* out_offset_x,
+                                      uint32_t* out_offset_y) {
+  return GetPackedTileOffset(
+      texture_info.size_2d.logical_width, texture_info.size_2d.logical_height,
+      texture_info.format_info(), out_offset_x, out_offset_y);
+}
+
 // https://github.com/BinomialLLC/crunch/blob/ea9b8d8c00c8329791256adafa8cf11e4e7942a2/inc/crn_decomp.h#L4108
 uint32_t TextureInfo::TiledOffset2DOuter(uint32_t y, uint32_t width,
-                                         uint32_t log_bpp) {
-  uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7);
-  uint32_t micro = ((y & 6) << 2) << log_bpp;
-  return macro + ((micro & ~15) << 1) + (micro & 15) +
-         ((y & 8) << (3 + log_bpp)) + ((y & 1) << 4);
+                                         uint32_t log2_bpp) {
+  uint32_t macro = ((y / 32) * (width / 32)) << (log2_bpp + 7);
+  uint32_t micro = ((y & 6) << 2) << log2_bpp;
+  return macro + ((micro & ~0xF) << 1) + (micro & 0xF) +
+         ((y & 8) << (3 + log2_bpp)) + ((y & 1) << 4);
 }
 
-uint32_t TextureInfo::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
+uint32_t TextureInfo::TiledOffset2DInner(uint32_t x, uint32_t y,
+                                         uint32_t log2_bpp,
                                          uint32_t base_offset) {
-  uint32_t macro = (x >> 5) << (bpp + 7);
-  uint32_t micro = (x & 7) << bpp;
-  uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15));
-  return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) +
+  uint32_t macro = (x / 32) << (log2_bpp + 7);
+  uint32_t micro = (x & 7) << log2_bpp;
+  uint32_t offset =
+      base_offset + (macro + ((micro & ~0xF) << 1) + (micro & 0xF));
+  return ((offset & ~0x1FF) << 3) + ((offset & 0x1C0) << 2) + (offset & 0x3F) +
          ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6);
 }
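TiledOffset2DOuter/Inner implement the 32x32-block Xenos tile swizzle (see the
crunch reference above). The outer term depends only on the row, so
ConvertTiled hoists it out of the inner loop; the inner term folds in the
x-dependent bits. The sum is a byte offset that callers shift right by
log2_bpp to recover a block index:

    // Usage shape, mirroring ConvertTiled above.
    uint32_t row = TextureInfo::TiledOffset2DOuter(y, width_in_blocks, log2_bpp);
    uint32_t byte_offset = TextureInfo::TiledOffset2DInner(x, y, log2_bpp, row);
    uint32_t block_index = byte_offset >> log2_bpp;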
src/xenia/gpu/texture_info.h

@@ -256,6 +256,8 @@ struct TextureInfo {
   Endian endianness;
   bool is_tiled;
   bool has_packed_mips;
+  uint32_t mip_address;
+  uint32_t mip_levels;
   uint32_t input_length;
 
   const FormatInfo* format_info() const {
@@ -304,14 +306,26 @@ struct TextureInfo {
                              uint32_t width, uint32_t height,
                              TextureInfo* out_info);
 
+  static void ConvertTiled(uint8_t* dest, const uint8_t* src, Endian endian,
+                           const FormatInfo* format_info, uint32_t offset_x,
+                           uint32_t offset_y, uint32_t block_pitch,
+                           uint32_t width, uint32_t height,
+                           uint32_t output_width);
+
+  static uint32_t GetMaxMipLevels(uint32_t width, uint32_t height,
+                                  uint32_t depth);
+  static uint32_t GetMipLocation(const TextureInfo& src, uint32_t mip,
+                                 uint32_t* offset_x, uint32_t* offset_y);
+  static bool GetPackedTileOffset(uint32_t width, uint32_t height,
+                                  const FormatInfo* format_info,
+                                  uint32_t* out_offset_x,
+                                  uint32_t* out_offset_y);
   static bool GetPackedTileOffset(const TextureInfo& texture_info,
                                   uint32_t* out_offset_x,
                                   uint32_t* out_offset_y);
   static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width,
                                      uint32_t log_bpp);
-  static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp,
-                                     uint32_t log2_bpp);
+  static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t log2_bpp,
+                                     uint32_t base_offset);
 
   uint64_t hash() const;
 
src/xenia/gpu/vulkan/texture_cache.cc

@@ -258,7 +258,13 @@ TextureCache::Texture* TextureCache::AllocateTexture(
   assert_not_null(texture_info.format_info());
   auto& config = texture_configs[int(texture_info.format_info()->format)];
   VkFormat format = config.host_format;
-  assert(format != VK_FORMAT_UNDEFINED);
+  if (format == VK_FORMAT_UNDEFINED) {
+    XELOGE(
+        "Texture Cache: Attempted to allocate texture format %s, which is "
+        "defined as VK_FORMAT_UNDEFINED!",
+        texture_info.format_info()->name);
+    return nullptr;
+  }
 
   image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
   image_info.usage =
@@ -302,7 +308,7 @@ TextureCache::Texture* TextureCache::AllocateTexture(
 
   image_info.format = format;
   image_info.extent = {texture_info.width + 1, texture_info.height + 1, 1};
-  image_info.mipLevels = 1;
+  image_info.mipLevels = texture_info.mip_levels;
   image_info.arrayLayers = texture_info.depth + 1;
   image_info.samples = VK_SAMPLE_COUNT_1_BIT;
   image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
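Since image_info.mipLevels now comes from the guest, the Vulkan invariant to
keep in mind is that mipLevels may not exceed floor(log2(max dimension)) + 1
for the image being created, which is exactly what GetMaxMipLevels computes
(guest width/height are stored minus one, hence the + 1). A hypothetical
guard, not part of this commit:

    image_info.mipLevels = std::min(
        texture_info.mip_levels,
        TextureInfo::GetMaxMipLevels(texture_info.width + 1,
                                     texture_info.height + 1, 1));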
@@ -664,8 +670,6 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) {
   sampler_create_info.addressModeW =
       address_mode_map[static_cast<int>(sampler_info.clamp_w)];
 
-  sampler_create_info.mipLodBias = sampler_info.lod_bias;
-
   float aniso = 0.f;
   switch (sampler_info.aniso_filter) {
     case AnisoFilter::kDisabled:
@@ -697,8 +701,9 @@ TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) {
 
   sampler_create_info.compareEnable = VK_FALSE;
   sampler_create_info.compareOp = VK_COMPARE_OP_NEVER;
-  sampler_create_info.minLod = 0.0f;
-  sampler_create_info.maxLod = 0.0f;
+  sampler_create_info.mipLodBias = sampler_info.lod_bias;
+  sampler_create_info.minLod = float(sampler_info.mip_min_level);
+  sampler_create_info.maxLod = float(sampler_info.mip_max_level);
   sampler_create_info.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
   sampler_create_info.unnormalizedCoordinates = VK_FALSE;
   VkSampler vk_sampler;
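minLod/maxLod now clamp sampling to the guest's mip range, and mipLodBias is
set alongside them rather than unconditionally earlier in the function. Two
Vulkan-side constraints apply here: |mipLodBias| is limited by
VkPhysicalDeviceLimits::maxSamplerLodBias, and maxLod must be >= minLod. A
hypothetical guard for the latter, not part of this commit:

    sampler_create_info.maxLod =
        std::max(sampler_create_info.minLod, sampler_create_info.maxLod);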
@@ -816,13 +821,13 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
                  size_t length) {
   switch (endianness) {
     case Endian::k8in16:
-      xe::copy_and_swap_16_aligned(dest, src, length / 2);
+      xe::copy_and_swap_16_unaligned(dest, src, length / 2);
       break;
     case Endian::k8in32:
-      xe::copy_and_swap_32_aligned(dest, src, length / 4);
+      xe::copy_and_swap_32_unaligned(dest, src, length / 4);
       break;
     case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
-      xe::copy_and_swap_16_in_32_aligned(dest, src, length);
+      xe::copy_and_swap_16_in_32_unaligned(dest, src, length);
       break;
     default:
     case Endian::kUnspecified:
@@ -867,43 +872,21 @@ void TextureCache::FlushPendingCommands(VkCommandBuffer command_buffer,
   vkBeginCommandBuffer(command_buffer, &begin_info);
 }
 
-void TextureCache::ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
-                                    const uint8_t* src, Endian src_endianness) {
-  // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
-  union {
-    uint8_t data[8];
-    struct {
-      uint8_t r0, g0, r1, g1;
-      uint32_t xx;
-    };
-  } block;
-  static_assert(sizeof(block) == 8, "CTX1 block mismatch");
-
-  const uint32_t bytes_per_block = 8;
-  TextureSwap(src_endianness, block.data, src, bytes_per_block);
-
-  uint8_t cr[4] = {
-      block.r0, block.r1,
-      static_cast<uint8_t>(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1),
-      static_cast<uint8_t>(1.f / 3.f * block.r0 + 2.f / 3.f * block.r1)};
-  uint8_t cg[4] = {
-      block.g0, block.g1,
-      static_cast<uint8_t>(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1),
-      static_cast<uint8_t>(1.f / 3.f * block.g0 + 2.f / 3.f * block.g1)};
-
-  for (uint32_t oy = 0; oy < 4; ++oy) {
-    for (uint32_t ox = 0; ox < 4; ++ox) {
-      uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3;
-      dest[(oy * dest_pitch) + (ox * 2) + 0] = cr[xx];
-      dest[(oy * dest_pitch) + (ox * 2) + 1] = cg[xx];
-    }
-  }
-}
-
 bool TextureCache::ConvertTexture2D(uint8_t* dest,
                                     VkBufferImageCopy* copy_region,
-                                    const TextureInfo& src) {
-  void* host_address = memory_->TranslatePhysical(src.guest_address);
+                                    uint32_t mip, const TextureInfo& src) {
+  uint32_t offset_x = 0;
+  uint32_t offset_y = 0;
+  uint32_t address =
+      TextureInfo::GetMipLocation(src, mip, &offset_x, &offset_y);
+  void* host_address = memory_->TranslatePhysical(address);
+
+  uint32_t logical_width = src.size_2d.logical_width >> mip;
+  uint32_t logical_height = src.size_2d.logical_height >> mip;
+  uint32_t block_width = src.size_2d.block_width >> mip;
+  uint32_t input_width = src.size_2d.input_width >> mip;
+  uint32_t input_height = src.size_2d.input_height >> mip;
 
   if (!src.is_tiled) {
-    uint32_t offset_x, offset_y;
     if (src.has_packed_mips &&
@@ -922,89 +905,25 @@ bool TextureCache::ConvertTexture2D(uint8_t* dest,
       src_mem += src.size_2d.input_pitch;
       dest += src.size_2d.input_pitch;
     }
-      copy_region->bufferRowLength = src.size_2d.input_width;
-      copy_region->bufferImageHeight = src.size_2d.input_height;
-      copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
-      copy_region->imageExtent = {src.size_2d.logical_width,
-                                  src.size_2d.logical_height, 1};
-      return true;
     } else {
       // Fast path copy entire image.
       TextureSwap(src.endianness, dest, host_address, src.input_length);
-      copy_region->bufferRowLength = src.size_2d.input_width;
-      copy_region->bufferImageHeight = src.size_2d.input_height;
-      copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
-      copy_region->imageExtent = {src.size_2d.logical_width,
-                                  src.size_2d.logical_height, 1};
-      return true;
     }
   } else {
     // Untile image.
    // We could do this in a shader to speed things up, as this is pretty
    // slow.
 
-    // TODO(benvanik): optimize this inner loop (or work by tiles).
     const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-    uint32_t bytes_per_block = src.format_info()->block_width *
-                               src.format_info()->block_height *
-                               src.format_info()->bits_per_pixel / 8;
-
-    uint32_t output_pitch = src.size_2d.input_width *
-                            src.format_info()->block_width *
-                            src.format_info()->bits_per_pixel / 8;
-
-    uint32_t output_row_height = 1;
-    if (src.texture_format == TextureFormat::k_CTX1) {
-      // TODO: Can we calculate this?
-      output_row_height = 4;
-    }
-
-    // Tiled textures can be packed; get the offset into the packed texture.
-    uint32_t offset_x;
-    uint32_t offset_y;
-    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
-    auto log2_bpp = (bytes_per_block >> 2) +
-                    ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
-
-    // Offset to the current row, in bytes.
-    uint32_t output_row_offset = 0;
-    for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
-      auto input_row_offset = TextureInfo::TiledOffset2DOuter(
-          offset_y + y, src.size_2d.block_width, log2_bpp);
-
-      // Go block-by-block on this row.
-      uint32_t output_offset = output_row_offset;
-      for (uint32_t x = 0; x < src.size_2d.block_width; x++) {
-        auto input_offset = TextureInfo::TiledOffset2DInner(
-            offset_x + x, offset_y + y, log2_bpp, input_row_offset);
-        input_offset >>= log2_bpp;
-
-        if (src.texture_format == TextureFormat::k_CTX1) {
-          // Convert to R8G8.
-          ConvertTexelCTX1(&dest[output_offset], output_pitch, src_mem,
-                           src.endianness);
-        } else {
-          // Generic swap to destination.
-          TextureSwap(src.endianness, dest + output_offset,
-                      src_mem + input_offset * bytes_per_block,
-                      bytes_per_block);
-        }
-
-        output_offset += bytes_per_block;
-      }
-
-      output_row_offset += output_pitch * output_row_height;
-    }
-
-    copy_region->bufferRowLength = src.size_2d.input_width;
-    copy_region->bufferImageHeight = src.size_2d.input_height;
-    copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
-    copy_region->imageExtent = {src.size_2d.logical_width,
-                                src.size_2d.logical_height, 1};
-    return true;
+    TextureInfo::ConvertTiled(dest, src_mem, src.endianness, src.format_info(),
+                              offset_x, offset_y, block_width, logical_width,
+                              logical_height, input_width);
   }
 
-  return false;
+  copy_region->bufferRowLength = input_width;
+  copy_region->bufferImageHeight = input_height;
+  copy_region->imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, mip, 0, 1};
+  copy_region->imageExtent = {logical_width, logical_height, 1};
+  return true;
 }
 
 bool TextureCache::ConvertTextureCube(uint8_t* dest,
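Per-mip dimensions are derived by shifting the base-level sizes right by the
mip index (256x128 at mip 0 becomes 128x64 at mip 1, and so on). One thing to
watch, sketched here as a hypothetical hardening and not part of this commit:
for deep chains the shift can reach zero, while Vulkan copy extents must stay
at least 1.

    uint32_t logical_width = std::max(src.size_2d.logical_width >> mip, 1u);
    uint32_t logical_height = std::max(src.size_2d.logical_height >> mip, 1u);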
@@ -1067,13 +986,13 @@ bool TextureCache::ConvertTextureCube(uint8_t* dest,
 }
 
 bool TextureCache::ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
-                                  const TextureInfo& src) {
+                                  uint32_t mip, const TextureInfo& src) {
   switch (src.dimension) {
     case Dimension::k1D:
       assert_always();
       break;
     case Dimension::k2D:
-      return ConvertTexture2D(dest, copy_region, src);
+      return ConvertTexture2D(dest, copy_region, mip, src);
     case Dimension::k3D:
       assert_always();
       break;
@@ -1083,6 +1002,145 @@ bool TextureCache::ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
   return false;
 }
 
+bool TextureCache::UploadTexture(VkCommandBuffer command_buffer,
+                                 VkFence completion_fence, Texture* dest,
+                                 const TextureInfo& src) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  size_t unpack_length;
+  if (!ComputeTextureStorage(&unpack_length, src)) {
+    XELOGW("Failed to compute texture storage");
+    return false;
+  }
+
+  size_t total_unpack_length = unpack_length;
+  for (uint32_t i = 1; i < src.mip_levels; i++) {
+    // Add in more space for mips.
+    total_unpack_length += unpack_length >> (2 * i);
+  }
+
+  if (!staging_buffer_.CanAcquire(total_unpack_length)) {
+    // Need to have unique memory for every upload for at least one frame. If we
+    // run out of memory, we need to flush all queued upload commands to the
+    // GPU.
+    FlushPendingCommands(command_buffer, completion_fence);
+
+    // Uploads have been flushed. Continue.
+    if (!staging_buffer_.CanAcquire(total_unpack_length)) {
+      // The staging buffer isn't big enough to hold this texture.
+      XELOGE(
+          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
+          total_unpack_length);
+      assert_always();
+      return false;
+    }
+  }
+
+  // Grab some temporary memory for staging.
+  auto alloc = staging_buffer_.Acquire(total_unpack_length, completion_fence);
+  assert_not_null(alloc);
+  if (!alloc) {
+    XELOGE("%s: Failed to acquire staging memory", __func__);
+    return false;
+  }
+
+  // DEBUG: Check the source address. If it's completely zero'd out, print it.
+  bool valid = false;
+  auto src_data = memory_->TranslatePhysical(src.guest_address);
+  for (uint32_t i = 0; i < src.input_length; i++) {
+    if (src_data[i] != 0) {
+      valid = true;
+      break;
+    }
+  }
+
+  if (!valid) {
+    XELOGW(
+        "Warning: Uploading blank texture at address 0x%.8X "
+        "(length: 0x%.8X, format: %s)",
+        src.guest_address, src.input_length, src.format_info()->name);
+  }
+
+  // Upload texture into GPU memory.
+  // TODO: If the GPU supports it, we can submit a compute batch to convert the
+  // texture and copy it to its destination. Otherwise, fallback to conversion
+  // on the CPU.
+  std::vector<VkBufferImageCopy> copy_regions(src.mip_levels);
+
+  // Base MIP
+  if (!ConvertTexture(reinterpret_cast<uint8_t*>(alloc->host_ptr),
+                      &copy_regions[0], 0, src)) {
+    XELOGW("Failed to convert texture");
+    return false;
+  }
+  copy_regions[0].bufferOffset = alloc->offset;
+  copy_regions[0].imageOffset = {0, 0, 0};
+
+  // Now upload all the MIPs
+  VkDeviceSize buffer_offset = unpack_length;
+  for (uint32_t mip = 1; mip < src.mip_levels; mip++) {
+    uint8_t* dest = reinterpret_cast<uint8_t*>(alloc->host_ptr) + buffer_offset;
+    ConvertTexture(dest, &copy_regions[mip], mip, src);
+    copy_regions[mip].bufferOffset = alloc->offset + buffer_offset;
+    copy_regions[mip].imageOffset = {0, 0, 0};
+
+    // With each mip, the length is divided by 4.
+    buffer_offset += unpack_length >> (2 * mip);
+  }
+
+  // Transition the texture into a transfer destination layout.
+  VkImageMemoryBarrier barrier;
+  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  barrier.pNext = nullptr;
+  barrier.srcAccessMask = 0;
+  barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+  barrier.oldLayout = dest->image_layout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.image = dest->image;
+  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, src.mip_levels,
+                              copy_regions[0].imageSubresource.baseArrayLayer,
+                              copy_regions[0].imageSubresource.layerCount};
+  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+    barrier.subresourceRange.aspectMask =
+        VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+  }
+
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  // Now move the converted texture into the destination.
+  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
+      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+    // Do just a depth upload (for now).
+    // This assumes depth buffers don't have mips (hopefully they don't)
+    copy_regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
+  }
+  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
+                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                         src.mip_levels, copy_regions.data());
+
+  // Now transition the texture into a shader readonly source.
+  barrier.srcAccessMask = barrier.dstAccessMask;
+  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  barrier.oldLayout = barrier.newLayout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                       VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
+                           VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+                       0, 0, nullptr, 0, nullptr, 1, &barrier);
+
+  dest->image_layout = barrier.newLayout;
+  return true;
+}
+
 bool TextureCache::ComputeTextureStorage(size_t* output_length,
                                          const TextureInfo& src) {
   if (src.texture_format == TextureFormat::k_CTX1) {
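The staging estimate treats every mip as a quarter of the one before it, so
the total stays under 4/3 of the base size; e.g. unpack_length = 0x100000 with
four extra mips adds 0x40000 + 0x10000 + 0x4000 + 0x1000. Standalone rendering
of the sum:

    #include <cstddef>
    #include <cstdint>
    size_t TotalUnpackLength(size_t base, uint32_t mip_levels) {
      size_t total = base;
      for (uint32_t i = 1; i < mip_levels; i++) {
        total += base >> (2 * i);  // each mip is ~1/4 the previous level
      }
      return total;
    }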
@@ -1182,120 +1240,6 @@ void TextureCache::WritebackTexture(Texture* texture) {
   wb_staging_buffer_.Scavenge();
 }
 
-bool TextureCache::UploadTexture(VkCommandBuffer command_buffer,
-                                 VkFence completion_fence, Texture* dest,
-                                 const TextureInfo& src) {
-#if FINE_GRAINED_DRAW_SCOPES
-  SCOPE_profile_cpu_f("gpu");
-#endif  // FINE_GRAINED_DRAW_SCOPES
-
-  size_t unpack_length;
-  if (!ComputeTextureStorage(&unpack_length, src)) {
-    XELOGW("Failed to compute texture storage");
-    return false;
-  }
-
-  if (!staging_buffer_.CanAcquire(unpack_length)) {
-    // Need to have unique memory for every upload for at least one frame. If we
-    // run out of memory, we need to flush all queued upload commands to the
-    // GPU.
-    FlushPendingCommands(command_buffer, completion_fence);
-
-    // Uploads have been flushed. Continue.
-    if (!staging_buffer_.CanAcquire(unpack_length)) {
-      // The staging buffer isn't big enough to hold this texture.
-      XELOGE(
-          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
-          unpack_length);
-      assert_always();
-      return false;
-    }
-  }
-
-  // Grab some temporary memory for staging.
-  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
-  assert_not_null(alloc);
-
-  // DEBUG: Check the source address. If it's completely zero'd out, print it.
-  bool valid = false;
-  auto src_data = memory_->TranslatePhysical(src.guest_address);
-  for (uint32_t i = 0; i < src.input_length; i++) {
-    if (src_data[i] != 0) {
-      valid = true;
-      break;
-    }
-  }
-
-  if (!valid) {
-    XELOGW(
-        "Warning: Uploading blank texture at address 0x%.8X "
-        "(length: 0x%.8X, format: %d)",
-        src.guest_address, src.input_length, src.texture_format);
-  }
-
-  // Upload texture into GPU memory.
-  // TODO: If the GPU supports it, we can submit a compute batch to convert the
-  // texture and copy it to its destination. Otherwise, fallback to conversion
-  // on the CPU.
-  VkBufferImageCopy copy_region;
-  if (!ConvertTexture(reinterpret_cast<uint8_t*>(alloc->host_ptr), &copy_region,
-                      src)) {
-    XELOGW("Failed to convert texture");
-    return false;
-  }
-
-  // Transition the texture into a transfer destination layout.
-  VkImageMemoryBarrier barrier;
-  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
-  barrier.pNext = nullptr;
-  barrier.srcAccessMask = 0;
-  barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-  barrier.oldLayout = dest->image_layout;
-  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-  barrier.image = dest->image;
-  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1,
-                              copy_region.imageSubresource.baseArrayLayer,
-                              copy_region.imageSubresource.layerCount};
-  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-    barrier.subresourceRange.aspectMask =
-        VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
-  }
-
-  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-                       VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0,
-                       nullptr, 1, &barrier);
-
-  // Now move the converted texture into the destination.
-  copy_region.bufferOffset = alloc->offset;
-  copy_region.imageOffset = {0, 0, 0};
-  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
-      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-    // Do just a depth upload (for now).
-    copy_region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
-  }
-  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
-                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
-                         &copy_region);
-
-  // Now transition the texture into a shader readonly source.
-  barrier.srcAccessMask = barrier.dstAccessMask;
-  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-  barrier.oldLayout = barrier.newLayout;
-  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
-  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT,
-                       VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                           VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
-                       0, 0, nullptr, 0, nullptr, 1, &barrier);
-
-  dest->image_layout = barrier.newLayout;
-  return true;
-}
-
 void TextureCache::HashTextureBindings(
     XXH64_state_t* hash_state, uint32_t& fetch_mask,
     const std::vector<Shader::TextureBinding>& bindings) {
src/xenia/gpu/vulkan/texture_cache.h

@@ -149,15 +149,12 @@ class TextureCache {
   void FlushPendingCommands(VkCommandBuffer command_buffer,
                             VkFence completion_fence);
 
-  static void ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
-                               const uint8_t* src, Endian src_endianness);
-
   bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region,
-                        const TextureInfo& src);
-  bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region,
+                        uint32_t mip, const TextureInfo& src);
+  bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_regions,
                           const TextureInfo& src);
   bool ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
-                      const TextureInfo& src);
+                      uint32_t mip, const TextureInfo& src);
   bool ComputeTextureStorage(size_t* output_length, const TextureInfo& src);
 
   // Writes a texture back into guest memory. This call is (mostly) asynchronous