- GPU: Removal of deprecated TextureInfo fields.

- GPU: Replicated removed deprecated fields into local struct for GL.
- GPU: Improved texture copies for Vulkan, no more pitch changes.
- GPU: Stubbed code for CTX1 conversion in Vulkan.
2017-08-07 19:30:06 -05:00 · 2017-08-07 19:30:06 -05:00 · b4ae5b9a01
parent 906f24cb1a
commit b4ae5b9a01
5 changed files with 344 additions and 415 deletions
--- a/src/xenia/gpu/gl4/texture_cache.cc
+++ b/src/xenia/gpu/gl4/texture_cache.cc
@ -689,6 +689,82 @@ void TextureCache::EvictTexture(TextureEntry* entry) {
  delete entry;
 }
 // Host-side layout of a texture as it is uploaded by the GL backend, derived
 // from the guest TextureInfo. Widths/heights are in pixels, rounded up to a
 // whole number of format blocks; pitches and lengths are in bytes.
 struct HostTextureInfo {
  // Total size in bytes of the host copy (all six faces for cube maps).
  uint32_t output_length;
  union {
    struct {
      uint32_t output_width;  // pixel width, block-aligned
      uint32_t output_pitch;  // bytes per row of blocks
    } size_1d;
    struct {
      uint32_t output_width;   // pixel width, block-aligned
      uint32_t output_height;  // pixel height, block-aligned
      uint32_t output_pitch;   // bytes per row of blocks
    } size_2d;
    struct {
    } size_3d;
    struct {
      uint32_t output_width;        // pixel width, block-aligned
      uint32_t output_height;       // pixel height, block-aligned
      uint32_t output_pitch;        // bytes per row of blocks
      uint32_t output_face_length;  // bytes per face
    } size_cube;
  };
  // Computes the host layout for |guest_info| and stores it in |out_info|.
  // Returns true when the layout was filled in, false for dimensions that are
  // not handled (3D, or an unknown dimension value); |out_info| is left
  // untouched in that case.
  static bool Setup(const TextureInfo& guest_info, HostTextureInfo* out_info) {
    auto& info = *out_info;
    auto format = guest_info.format_info();
    // Size of one whole block in bytes. The block height must be part of the
    // product: the pitch below is "blocks per row * bytes per block" and the
    // length is "pitch * rows of blocks", so leaving the height out
    // under-sizes every format whose blocks are taller than one pixel. This
    // matches the bytes_per_block formula used elsewhere for texture copies.
    uint32_t bytes_per_block =
        format->block_width * format->block_height * format->bits_per_pixel / 8;
    // Number of blocks needed to cover |extent| pixels along one axis.
    const auto blocks_for = [](uint32_t extent, uint32_t block_extent) {
      return xe::round_up(extent, block_extent) / block_extent;
    };
    switch (guest_info.dimension) {
      case Dimension::k1D: {
        uint32_t block_width =
            blocks_for(guest_info.size_1d.logical_width, format->block_width);
        info.size_1d.output_width = block_width * format->block_width;
        info.size_1d.output_pitch = block_width * bytes_per_block;
        info.output_length = info.size_1d.output_pitch;
        return true;
      }
      case Dimension::k2D: {
        uint32_t block_width =
            blocks_for(guest_info.size_2d.logical_width, format->block_width);
        uint32_t block_height =
            blocks_for(guest_info.size_2d.logical_height, format->block_height);
        info.size_2d.output_width = block_width * format->block_width;
        info.size_2d.output_height = block_height * format->block_height;
        info.size_2d.output_pitch = block_width * bytes_per_block;
        info.output_length = info.size_2d.output_pitch * block_height;
        return true;
      }
      case Dimension::k3D: {
        // 3D textures are not supported here.
        return false;
      }
      case Dimension::kCube: {
        uint32_t block_width =
            blocks_for(guest_info.size_cube.logical_width, format->block_width);
        uint32_t block_height = blocks_for(guest_info.size_cube.logical_height,
                                           format->block_height);
        info.size_cube.output_width = block_width * format->block_width;
        info.size_cube.output_height = block_height * format->block_height;
        info.size_cube.output_pitch = block_width * bytes_per_block;
        info.size_cube.output_face_length =
            info.size_cube.output_pitch * block_height;
        // A cube map is six equally sized faces stored back to back.
        info.output_length = info.size_cube.output_face_length * 6;
        return true;
      }
    }
    return false;
  }
 };
 void TextureSwap(Endian endianness, void* dest, const void* src,
                 size_t length) {
  switch (endianness) {
@ -720,14 +796,20 @@ bool TextureCache::UploadTexture1D(GLuint texture,
    return false;
  }
-  size_t unpack_length = texture_info.output_length;
+  HostTextureInfo host_info;
  if (!HostTextureInfo::Setup(texture_info, &host_info)) {
    assert_always("Failed to set up host texture info");
    return false;
  }
  size_t unpack_length = host_info.output_length;
  glTextureStorage1D(texture, 1, config.internal_format,
-                     texture_info.size_1d.output_width);
+                     host_info.size_1d.output_width);
  auto allocation = scratch_buffer_->Acquire(unpack_length);
  if (!texture_info.is_tiled) {
-    if (texture_info.size_1d.input_pitch == texture_info.size_1d.output_pitch) {
+    if (texture_info.size_1d.input_pitch == host_info.size_1d.output_pitch) {
      TextureSwap(texture_info.endianness, allocation.host_ptr, host_address,
                  unpack_length);
    } else {
@ -744,10 +826,10 @@ bool TextureCache::UploadTexture1D(GLuint texture,
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle());
  if (texture_info.is_compressed()) {
-    glCompressedTextureSubImage1D(
+    glCompressedTextureSubImage1D(texture, 0, 0, host_info.size_1d.output_width,
-        texture, 0, 0, texture_info.size_1d.output_width, config.format,
+                                  config.format,
-        static_cast<GLsizei>(unpack_length),
+                                  static_cast<GLsizei>(unpack_length),
-        reinterpret_cast<void*>(unpack_offset));
+                                  reinterpret_cast<void*>(unpack_offset));
  } else {
    // Most of these don't seem to have an effect on compressed images.
    // glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_TRUE);
@ -755,7 +837,7 @@ bool TextureCache::UploadTexture1D(GLuint texture,
    // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-    glTextureSubImage1D(texture, 0, 0, texture_info.size_1d.output_width,
+    glTextureSubImage1D(texture, 0, 0, host_info.size_1d.output_width,
                        config.format, config.type,
                        reinterpret_cast<void*>(unpack_offset));
  }
@ -776,10 +858,16 @@ bool TextureCache::UploadTexture2D(GLuint texture,
    return false;
  }
-  size_t unpack_length = texture_info.output_length;
+  HostTextureInfo host_info;
  if (!HostTextureInfo::Setup(texture_info, &host_info)) {
    assert_always("Failed to set up host texture info");
    return false;
  }
  size_t unpack_length = host_info.output_length;
  glTextureStorage2D(texture, 1, config.internal_format,
-                     texture_info.size_2d.output_width,
+                     host_info.size_2d.output_width,
-                     texture_info.size_2d.output_height);
+                     host_info.size_2d.output_height);
  auto allocation = scratch_buffer_->Acquire(unpack_length);
@ -796,16 +884,16 @@ bool TextureCache::UploadTexture2D(GLuint texture,
      src += offset_x * bytes_per_block;
      uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
      uint32_t pitch = std::min(texture_info.size_2d.input_pitch,
-                                texture_info.size_2d.output_pitch);
+                                host_info.size_2d.output_pitch);
      for (uint32_t y = 0; y < std::min(texture_info.size_2d.block_height,
                                        texture_info.size_2d.logical_height);
           y++) {
        TextureSwap(texture_info.endianness, dest, src, pitch);
        src += texture_info.size_2d.input_pitch;
-        dest += texture_info.size_2d.output_pitch;
+        dest += host_info.size_2d.output_pitch;
      }
    } else if (texture_info.size_2d.input_pitch ==
-               texture_info.size_2d.output_pitch) {
+               host_info.size_2d.output_pitch) {
      // Fast path copy entire image.
      TextureSwap(texture_info.endianness, allocation.host_ptr, host_address,
                  unpack_length);
@ -816,13 +904,13 @@ bool TextureCache::UploadTexture2D(GLuint texture,
      const uint8_t* src = host_address;
      uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
      uint32_t pitch = std::min(texture_info.size_2d.input_pitch,
-                                texture_info.size_2d.output_pitch);
+                                host_info.size_2d.output_pitch);
      for (uint32_t y = 0; y < std::min(texture_info.size_2d.block_height,
                                        texture_info.size_2d.logical_height);
           y++) {
        TextureSwap(texture_info.endianness, dest, src, pitch);
        src += texture_info.size_2d.input_pitch;
-        dest += texture_info.size_2d.output_pitch;
+        dest += host_info.size_2d.output_pitch;
      }
    }
  } else {
@ -846,7 +934,7 @@ bool TextureCache::UploadTexture2D(GLuint texture,
    for (uint32_t y = 0, output_base_offset = 0;
         y < std::min(texture_info.size_2d.block_height,
                      texture_info.size_2d.logical_height);
-         y++, output_base_offset += texture_info.size_2d.output_pitch) {
+         y++, output_base_offset += host_info.size_2d.output_pitch) {
      auto input_base_offset = TextureInfo::TiledOffset2DOuter(
          offset_y + y, (texture_info.size_2d.input_width /
                         texture_info.format_info()->block_width),
@ -872,8 +960,8 @@ bool TextureCache::UploadTexture2D(GLuint texture,
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle());
  if (texture_info.is_compressed()) {
    glCompressedTextureSubImage2D(
-        texture, 0, 0, 0, texture_info.size_2d.output_width,
+        texture, 0, 0, 0, host_info.size_2d.output_width,
-        texture_info.size_2d.output_height, config.format,
+        host_info.size_2d.output_height, config.format,
        static_cast<GLsizei>(unpack_length),
        reinterpret_cast<void*>(unpack_offset));
  } else {
@ -883,8 +971,8 @@ bool TextureCache::UploadTexture2D(GLuint texture,
    // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-    glTextureSubImage2D(texture, 0, 0, 0, texture_info.size_2d.output_width,
+    glTextureSubImage2D(texture, 0, 0, 0, host_info.size_2d.output_width,
-                        texture_info.size_2d.output_height, config.format,
+                        host_info.size_2d.output_height, config.format,
                        config.type, reinterpret_cast<void*>(unpack_offset));
  }
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
@ -904,15 +992,21 @@ bool TextureCache::UploadTextureCube(GLuint texture,
    return false;
  }
-  size_t unpack_length = texture_info.output_length;
+  HostTextureInfo host_info;
  if (!HostTextureInfo::Setup(texture_info, &host_info)) {
    assert_always("Failed to set up host texture info");
    return false;
  }
  size_t unpack_length = host_info.output_length;
  glTextureStorage2D(texture, 1, config.internal_format,
-                     texture_info.size_cube.output_width,
+                     host_info.size_cube.output_width,
-                     texture_info.size_cube.output_height);
+                     host_info.size_cube.output_height);
  auto allocation = scratch_buffer_->Acquire(unpack_length);
  if (!texture_info.is_tiled) {
    if (texture_info.size_cube.input_pitch ==
-        texture_info.size_cube.output_pitch) {
+        host_info.size_cube.output_pitch) {
      // Fast path copy entire image.
      TextureSwap(texture_info.endianness, allocation.host_ptr, host_address,
                  unpack_length);
@ -924,11 +1018,11 @@ bool TextureCache::UploadTextureCube(GLuint texture,
      uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
      for (int face = 0; face < 6; ++face) {
        uint32_t pitch = std::min(texture_info.size_cube.input_pitch,
-                                  texture_info.size_cube.output_pitch);
+                                  host_info.size_cube.output_pitch);
        for (uint32_t y = 0; y < texture_info.size_cube.block_height; y++) {
          TextureSwap(texture_info.endianness, dest, src, pitch);
          src += texture_info.size_cube.input_pitch;
-          dest += texture_info.size_cube.output_pitch;
+          dest += host_info.size_cube.output_pitch;
        }
      }
    }
@ -948,7 +1042,7 @@ bool TextureCache::UploadTextureCube(GLuint texture,
    for (int face = 0; face < 6; ++face) {
      for (uint32_t y = 0, output_base_offset = 0;
           y < texture_info.size_cube.block_height;
-           y++, output_base_offset += texture_info.size_cube.output_pitch) {
+           y++, output_base_offset += host_info.size_cube.output_pitch) {
        auto input_base_offset = TextureInfo::TiledOffset2DOuter(
            offset_y + y, (texture_info.size_cube.input_width /
                           texture_info.format_info()->block_width),
@ -965,7 +1059,7 @@ bool TextureCache::UploadTextureCube(GLuint texture,
        }
      }
      src += texture_info.size_cube.input_face_length;
-      dest += texture_info.size_cube.output_face_length;
+      dest += host_info.size_cube.output_face_length;
    }
  }
  size_t unpack_offset = allocation.offset;
@ -977,8 +1071,8 @@ bool TextureCache::UploadTextureCube(GLuint texture,
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle());
  if (texture_info.is_compressed()) {
    glCompressedTextureSubImage3D(
-        texture, 0, 0, 0, 0, texture_info.size_cube.output_width,
+        texture, 0, 0, 0, 0, host_info.size_cube.output_width,
-        texture_info.size_cube.output_height, 6, config.format,
+        host_info.size_cube.output_height, 6, config.format,
        static_cast<GLsizei>(unpack_length),
        reinterpret_cast<void*>(unpack_offset));
  } else {
@ -988,9 +1082,8 @@ bool TextureCache::UploadTextureCube(GLuint texture,
    // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-    glTextureSubImage3D(texture, 0, 0, 0, 0,
+    glTextureSubImage3D(texture, 0, 0, 0, 0, host_info.size_cube.output_width,
-                        texture_info.size_cube.output_width,
+                        host_info.size_cube.output_height, 6, config.format,
                        texture_info.size_cube.output_height, 6, config.format,
                        config.type, reinterpret_cast<void*>(unpack_offset));
  }
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
--- a/src/xenia/gpu/texture_info.cc
+++ b/src/xenia/gpu/texture_info.cc
@ -56,7 +56,6 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
  info.is_tiled = fetch.tiled;
  info.has_packed_mips = fetch.packed_mips;
  info.input_length = 0;  // Populated below.
  info.output_length = 0;
  if (info.format_info()->format == TextureFormat::kUnknown) {
    assert_true("Unsupported texture format");
@ -71,15 +70,6 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
    case Dimension::k2D: {
      info.CalculateTextureSizes2D(fetch.size_2d.width + 1,
                                   fetch.size_2d.height + 1);
      // DEBUG: Make sure our calculated pitch is equal to the fetch pitch.
      uint32_t bytes_per_block = info.format_info()->block_width *
                                 info.format_info()->block_height *
                                 info.format_info()->bits_per_pixel / 8;
      assert_true(info.size_2d.input_pitch ==
                  (bytes_per_block * fetch.pitch << 5) /
                      info.format_info()->block_width);
    } break;
    case Dimension::k3D: {
      // TODO(benvanik): calculate size.
@ -110,7 +100,6 @@ bool TextureInfo::PrepareResolve(uint32_t physical_address,
  info.is_tiled = true;
  info.has_packed_mips = false;
  info.input_length = 0;
  info.output_length = 0;
  if (info.format_info()->format == TextureFormat::kUnknown) {
    assert_true("Unsupported texture format");
@ -145,11 +134,6 @@ void TextureInfo::CalculateTextureSizes1D(uint32_t width) {
  size_1d.input_width = tile_width * 32 * format->block_width;
  size_1d.input_pitch = byte_pitch;
  input_length = size_1d.input_pitch;
  // TODO(DrChat): Remove this, leave it up to the backend.
  size_1d.output_width = block_width * format->block_width;
  size_1d.output_pitch = block_width * bytes_per_block;
  output_length = size_1d.output_pitch;
 }
 void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) {
@ -190,13 +174,6 @@ void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) {
  size_2d.input_pitch = byte_pitch;
  input_length = size_2d.input_pitch * size_2d.block_height;
  // TODO(DrChat): Remove this, leave it up to the backend.
  size_2d.output_width = block_width * format->block_width;
  size_2d.output_height = block_height * format->block_height;
  size_2d.output_pitch = block_width * bytes_per_block;
  output_length = size_2d.output_pitch * block_height;
 }
 void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height,
@ -235,14 +212,6 @@ void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height,
  size_cube.input_face_length = size_cube.input_pitch * size_cube.block_height;
  input_length = size_cube.input_face_length * 6;
  // TODO(DrChat): Remove this, leave it up to the backend.
  size_cube.output_width = block_width * format->block_width;
  size_cube.output_height = block_height * format->block_height;
  size_cube.output_pitch = block_width * bytes_per_block;
  size_cube.output_face_length = size_cube.output_pitch * block_height;
  output_length = size_cube.output_face_length * 6;
 }
 bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@ -256,7 +256,6 @@ struct TextureInfo {
  bool is_tiled;
  bool has_packed_mips;
  uint32_t input_length;
  uint32_t output_length;
  const FormatInfo* format_info() const {
    return FormatInfo::Get(static_cast<uint32_t>(texture_format));
@ -272,10 +271,6 @@ struct TextureInfo {
      uint32_t block_width;  // # of horizontal blocks
      uint32_t input_width;  // pixel pitch
      uint32_t input_pitch;  // pitch in bytes
      // DEPRECATED: Do not use.
      uint32_t output_width;
      uint32_t output_pitch;
    } size_1d;
    struct {
      uint32_t logical_width;
@ -285,11 +280,6 @@ struct TextureInfo {
      uint32_t input_width;   // pixel pitch
      uint32_t input_height;  // pixel height
      uint32_t input_pitch;   // pitch in bytes
      // DEPRECATED: Do not use.
      uint32_t output_width;
      uint32_t output_height;
      uint32_t output_pitch;
    } size_2d;
    struct {
    } size_3d;
@ -302,12 +292,6 @@ struct TextureInfo {
      uint32_t input_height;       // pixel height
      uint32_t input_pitch;        // pitch in bytes
      uint32_t input_face_length;  // pitch of face in bytes
      // DEPRECATED: Do not use.
      uint32_t output_width;
      uint32_t output_height;
      uint32_t output_pitch;
      uint32_t output_face_length;
    } size_cube;
  };
--- a/src/xenia/gpu/vulkan/texture_cache.cc
+++ b/src/xenia/gpu/vulkan/texture_cache.cc
@ -98,7 +98,7 @@ static const TextureConfig texture_configs[64] = {
    {TextureFormat::k_32_32_32_FLOAT, VK_FORMAT_R32G32B32_SFLOAT},
    {TextureFormat::k_DXT3A, VK_FORMAT_UNDEFINED},
    {TextureFormat::k_DXT5A, VK_FORMAT_UNDEFINED},
-    {TextureFormat::k_CTX1, VK_FORMAT_UNDEFINED},
+    {TextureFormat::k_CTX1, VK_FORMAT_R8G8_UINT},
    {TextureFormat::k_DXT3A_AS_1_1_1_1, VK_FORMAT_UNDEFINED},
    {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED},
    {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED},
@ -545,29 +545,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info,
  trace_writer_->WriteMemoryRead(texture_info.guest_address,
                                 texture_info.input_length);
-  bool uploaded = false;
+  if (!UploadTexture(command_buffer, completion_fence, texture, texture_info)) {
  switch (texture_info.dimension) {
    case Dimension::k1D: {
      uploaded = UploadTexture1D(command_buffer, completion_fence, texture,
                                 texture_info);
    } break;
    case Dimension::k2D: {
      uploaded = UploadTexture2D(command_buffer, completion_fence, texture,
                                 texture_info);
    } break;
    case Dimension::kCube: {
      uploaded = UploadTextureCube(command_buffer, completion_fence, texture,
                                   texture_info);
    } break;
    default:
      assert_unhandled_case(texture_info.dimension);
      break;
  }
  if (!uploaded) {
    FreeTexture(texture);
    return nullptr;
  }
@ -578,7 +556,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info,
      VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT,
      xe::format_string(
          "0x%.8X - 0x%.8X", texture_info.guest_address,
-          texture_info.guest_address + texture_info.output_length));
+          texture_info.guest_address + texture_info.input_length));
  // Okay. Now that the texture is uploaded from system memory, put a writewatch
  // on it to tell us if it's been modified from the guest.
@ -912,169 +890,230 @@ void TextureCache::FlushPendingCommands(VkCommandBuffer command_buffer,
  vkBeginCommandBuffer(command_buffer, &begin_info);
 }
-void TextureCache::ConvertTexture1D(uint8_t* dest, const TextureInfo& src) {
+bool TextureCache::ConvertTexture1D(uint8_t* dest,
                                    VkBufferImageCopy* copy_region,
                                    const TextureInfo& src) {
  void* host_address = memory_->TranslatePhysical(src.guest_address);
-  if (!src.is_tiled) {
+  if (src.texture_format == TextureFormat::k_CTX1) {
-    if (src.size_1d.input_pitch == src.size_1d.output_pitch) {
+    assert_always();
-      TextureSwap(src.endianness, dest, host_address, src.output_length);
+  } else {
    if (!src.is_tiled) {
      TextureSwap(src.endianness, dest, host_address, src.input_length);
      copy_region->bufferRowLength = src.size_1d.input_width;
      copy_region->bufferImageHeight = 1;
      copy_region->imageExtent = {src.size_1d.logical_width, 1, 1};
      return true;
    } else {
      assert_always();
    }
  } else {
    assert_always();
  }
  return false;
 }
-void TextureCache::ConvertTexture2D(uint8_t* dest, const TextureInfo& src) {
+bool TextureCache::ConvertTexture2D(uint8_t* dest,
                                    VkBufferImageCopy* copy_region,
                                    const TextureInfo& src) {
  void* host_address = memory_->TranslatePhysical(src.guest_address);
-  if (!src.is_tiled) {
+  if (src.texture_format == TextureFormat::k_CTX1) {
-    uint32_t offset_x, offset_y;
+    assert_always();
-    if (src.has_packed_mips &&
+  } else {
-        TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) {
+    if (!src.is_tiled) {
      uint32_t offset_x, offset_y;
      if (src.has_packed_mips &&
          TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) {
        uint32_t bytes_per_block = src.format_info()->block_width *
                                   src.format_info()->block_height *
                                   src.format_info()->bits_per_pixel / 8;
        const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
        src_mem += offset_y * src.size_2d.input_pitch;
        src_mem += offset_x * bytes_per_block;
        for (uint32_t y = 0;
             y < std::min(src.size_2d.block_height, src.size_2d.logical_height);
             y++) {
          TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch);
          src_mem += src.size_2d.input_pitch;
          dest += src.size_2d.input_pitch;
        }
        copy_region->bufferRowLength = src.size_2d.input_width;
        copy_region->bufferImageHeight = src.size_2d.input_height;
        copy_region->imageExtent = {src.size_2d.logical_width,
                                    src.size_2d.logical_height, 1};
        return true;
      } else {
        // Fast path copy entire image.
        TextureSwap(src.endianness, dest, host_address, src.input_length);
        copy_region->bufferRowLength = src.size_2d.input_width;
        copy_region->bufferImageHeight = src.size_2d.input_height;
        copy_region->imageExtent = {src.size_2d.logical_width,
                                    src.size_2d.logical_height, 1};
        return true;
      }
    } else {
      // Untile image.
      // We could do this in a shader to speed things up, as this is pretty
      // slow.
      // TODO(benvanik): optimize this inner loop (or work by tiles).
      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
      uint32_t bytes_per_block = src.format_info()->block_width *
                                 src.format_info()->block_height *
                                 src.format_info()->bits_per_pixel / 8;
-      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+      // Tiled textures can be packed; get the offset into the packed texture.
-      src_mem += offset_y * src.size_2d.input_pitch;
+      uint32_t offset_x;
-      src_mem += offset_x * bytes_per_block;
+      uint32_t offset_y;
-      uint32_t pitch =
+      TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
-          std::min(src.size_2d.input_pitch, src.size_2d.output_pitch);
+      auto log2_bpp = (bytes_per_block >> 2) +
-      for (uint32_t y = 0;
+                      ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
           y < std::min(src.size_2d.block_height, src.size_2d.logical_height);
           y++) {
        TextureSwap(src.endianness, dest, src_mem, pitch);
        src_mem += src.size_2d.input_pitch;
        dest += src.size_2d.output_pitch;
      }
    } else if (src.size_2d.input_pitch == src.size_2d.output_pitch) {
      // Fast path copy entire image.
      TextureSwap(src.endianness, dest, host_address, src.output_length);
    } else {
      // Slow path copy row-by-row because strides differ.
      // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
      // this exact thing under the covers, so we just always do it here.
      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
      uint32_t pitch =
          std::min(src.size_2d.input_pitch, src.size_2d.output_pitch);
      for (uint32_t y = 0;
           y < std::min(src.size_2d.block_height, src.size_2d.logical_height);
           y++) {
        TextureSwap(src.endianness, dest, src_mem, pitch);
        src_mem += src.size_2d.input_pitch;
        dest += src.size_2d.output_pitch;
      }
    }
  } else {
    // Untile image.
    // We could do this in a shader to speed things up, as this is pretty slow.
-    // TODO(benvanik): optimize this inner loop (or work by tiles).
+      // Offset to the current row, in bytes.
-    const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+      uint32_t output_row_offset = 0;
-    uint32_t bytes_per_block = src.format_info()->block_width *
+      for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
-                               src.format_info()->block_height *
+        auto input_row_offset = TextureInfo::TiledOffset2DOuter(
-                               src.format_info()->bits_per_pixel / 8;
+            offset_y + y, src.size_2d.block_width, log2_bpp);
-    // Tiled textures can be packed; get the offset into the packed texture.
+        // Go block-by-block on this row.
-    uint32_t offset_x;
+        uint32_t output_offset = output_row_offset;
-    uint32_t offset_y;
+        for (uint32_t x = 0; x < src.size_2d.block_width; x++) {
    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
    auto log2_bpp = (bytes_per_block >> 2) +
                    ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
    // Offset to the current row, in bytes.
    uint32_t output_row_offset = 0;
    for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
      auto input_row_offset = TextureInfo::TiledOffset2DOuter(
          offset_y + y, src.size_2d.block_width, log2_bpp);
      // Go block-by-block on this row.
      uint32_t output_offset = output_row_offset;
      for (uint32_t x = 0; x < src.size_2d.block_width; x++) {
        auto input_offset =
            TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y,
                                            log2_bpp, input_row_offset) >>
            log2_bpp;
        TextureSwap(src.endianness, dest + output_offset,
                    src_mem + input_offset * bytes_per_block, bytes_per_block);
        output_offset += bytes_per_block;
      }
      output_row_offset += src.size_2d.output_pitch;
    }
  }
 }
 void TextureCache::ConvertTextureCube(uint8_t* dest, const TextureInfo& src) {
  void* host_address = memory_->TranslatePhysical(src.guest_address);
  if (!src.is_tiled) {
    if (src.size_cube.input_pitch == src.size_cube.output_pitch) {
      // Fast path copy entire image.
      TextureSwap(src.endianness, dest, host_address, src.output_length);
    } else {
      // Slow path copy row-by-row because strides differ.
      // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
      // this exact thing under the covers, so we just always do it here.
      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
      for (int face = 0; face < 6; ++face) {
        uint32_t pitch =
            std::min(src.size_cube.input_pitch, src.size_cube.output_pitch);
        for (uint32_t y = 0; y < src.size_cube.block_height; y++) {
          TextureSwap(src.endianness, dest, src_mem, pitch);
          src_mem += src.size_cube.input_pitch;
          dest += src.size_cube.output_pitch;
        }
      }
    }
  } else {
    // TODO(benvanik): optimize this inner loop (or work by tiles).
    const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
    uint32_t bytes_per_block = src.format_info()->block_width *
                               src.format_info()->block_height *
                               src.format_info()->bits_per_pixel / 8;
    // Tiled textures can be packed; get the offset into the packed texture.
    uint32_t offset_x;
    uint32_t offset_y;
    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
    auto bpp = (bytes_per_block >> 2) +
               ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
    for (int face = 0; face < 6; ++face) {
      for (uint32_t y = 0, output_base_offset = 0;
           y < src.size_cube.block_height;
           y++, output_base_offset += src.size_cube.output_pitch) {
        auto input_base_offset = TextureInfo::TiledOffset2DOuter(
            offset_y + y,
            (src.size_cube.input_width / src.format_info()->block_width), bpp);
        for (uint32_t x = 0, output_offset = output_base_offset;
             x < src.size_cube.block_width;
             x++, output_offset += bytes_per_block) {
          auto input_offset =
-              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
+              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y,
-                                              input_base_offset) >>
+                                              log2_bpp, input_row_offset) >>
-              bpp;
+              log2_bpp;
          TextureSwap(src.endianness, dest + output_offset,
                      src_mem + input_offset * bytes_per_block,
                      bytes_per_block);
          output_offset += bytes_per_block;
        }
        output_row_offset += src.size_2d.input_pitch;
      }
-      src_mem += src.size_cube.input_face_length;
+
-      dest += src.size_cube.output_face_length;
+      copy_region->bufferRowLength = src.size_2d.input_width;
      copy_region->bufferImageHeight = src.size_2d.input_height;
      copy_region->imageExtent = {src.size_2d.logical_width,
                                  src.size_2d.logical_height, 1};
      return true;
    }
  }
  return false;
 }
 // Converts all six faces of a cube texture from guest memory into |dest| and
 // fills in the buffer row length, buffer image height and image extent of
 // |copy_region|. The remaining fields of |copy_region| are left untouched.
 // Returns true on success. CTX1 textures are not handled here: that path
 // only asserts and reports failure.
 bool TextureCache::ConvertTextureCube(uint8_t* dest,
                                       VkBufferImageCopy* copy_region,
                                       const TextureInfo& src) {
  void* guest_data = memory_->TranslatePhysical(src.guest_address);
  if (src.texture_format == TextureFormat::k_CTX1) {
    // CTX1 conversion is stubbed out.
    assert_always();
    return false;
  }
  if (!src.is_tiled) {
    // Linear layout: swap the whole image (all faces) in a single pass.
    TextureSwap(src.endianness, dest, guest_data, src.input_length);
  } else {
    // TODO(benvanik): optimize this inner loop (or work by tiles).
    const uint8_t* face_src = reinterpret_cast<const uint8_t*>(guest_data);
    uint8_t* face_dest = dest;
    const auto* format = src.format_info();
    uint32_t bytes_per_block =
        format->block_width * format->block_height * format->bits_per_pixel / 8;
    // Tiled textures can be packed; find where this texture starts inside the
    // packed tile.
    uint32_t packed_x;
    uint32_t packed_y;
    TextureInfo::GetPackedTileOffset(src, &packed_x, &packed_y);
    // Evaluates to log2(bytes_per_block) for power-of-two block sizes from 1
    // to 16 bytes.
    auto log2_bpp = (bytes_per_block >> 2) +
                    ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
    for (int face = 0; face < 6; ++face) {
      // Output rows restart at the beginning of every face.
      uint32_t row_offset = 0;
      for (uint32_t y = 0; y < src.size_cube.block_height; ++y) {
        auto tiled_row = TextureInfo::TiledOffset2DOuter(
            packed_y + y, (src.size_cube.input_width / format->block_width),
            log2_bpp);
        uint32_t block_offset = row_offset;
        for (uint32_t x = 0; x < src.size_cube.block_width; ++x) {
          auto tiled_block =
              TextureInfo::TiledOffset2DInner(packed_x + x, packed_y + y,
                                              log2_bpp, tiled_row) >>
              log2_bpp;
          TextureSwap(src.endianness, face_dest + block_offset,
                      face_src + tiled_block * bytes_per_block,
                      bytes_per_block);
          block_offset += bytes_per_block;
        }
        row_offset += src.size_cube.input_pitch;
      }
      // Advance both cursors to the next face.
      face_src += src.size_cube.input_face_length;
      face_dest += src.size_cube.input_face_length;
    }
  }
  // Both layouts describe the staged data to the copy command the same way.
  copy_region->bufferRowLength = src.size_cube.input_width;
  copy_region->bufferImageHeight = src.size_cube.input_height;
  copy_region->imageExtent = {src.size_cube.logical_width,
                              src.size_cube.logical_height, 6};
  return true;
 }
 // Forwards to the converter matching the texture's dimension. Returns the
 // converter's result, or false for any dimension other than 1D, 2D or cube.
 bool TextureCache::ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
                                   const TextureInfo& src) {
  const auto dimension = src.dimension;
  if (dimension == Dimension::k1D) {
    return ConvertTexture1D(dest, copy_region, src);
  }
  if (dimension == Dimension::k2D) {
    return ConvertTexture2D(dest, copy_region, src);
  }
  if (dimension == Dimension::kCube) {
    return ConvertTextureCube(dest, copy_region, src);
  }
  // No converter exists for the remaining dimensions.
  return false;
 }
 // Computes how many bytes of staging memory the converted texture needs and
 // stores the result in |output_length|. Returns false (leaving
 // |output_length| untouched) for a CTX1 texture whose dimension is not 1D, 2D
 // or cube.
 bool TextureCache::ComputeTextureStorage(size_t* output_length,
                                          const TextureInfo& src) {
  if (src.texture_format != TextureFormat::k_CTX1) {
    // Every other format is staged at its guest size.
    *output_length = src.input_length;
    return true;
  }
  // CTX1 is sized at two bytes per logical texel (times six faces for cubes).
  switch (src.dimension) {
    case Dimension::k1D:
      *output_length = src.size_1d.logical_width * 2;
      break;
    case Dimension::k2D:
      *output_length =
          src.size_2d.logical_width * src.size_2d.logical_height * 2;
      break;
    case Dimension::kCube:
      *output_length =
          src.size_cube.logical_width * src.size_cube.logical_height * 2 * 6;
      break;
    default:
      return false;
  }
  return true;
 }
-bool TextureCache::UploadTexture1D(VkCommandBuffer command_buffer,
+bool TextureCache::UploadTexture(VkCommandBuffer command_buffer,
-                                   VkFence completion_fence, Texture* dest,
+                                 VkFence completion_fence, Texture* dest,
-                                   const TextureInfo& src) {
+                                 const TextureInfo& src) {
 #if FINE_GRAINED_DRAW_SCOPES
  SCOPE_profile_cpu_f("gpu");
 #endif  // FINE_GRAINED_DRAW_SCOPES
-  assert_true(src.dimension == Dimension::k1D);
+  size_t unpack_length;
  if (!ComputeTextureStorage(&unpack_length, src)) {
    XELOGW("Failed to compute texture storage");
    return false;
  }
  size_t unpack_length = src.output_length;
  if (!staging_buffer_.CanAcquire(unpack_length)) {
    // Need to have unique memory for every upload for at least one frame. If we
    // run out of memory, we need to flush all queued upload commands to the
@ -1100,14 +1139,20 @@ bool TextureCache::UploadTexture1D(VkCommandBuffer command_buffer,
  // TODO: If the GPU supports it, we can submit a compute batch to convert the
  // texture and copy it to its destination. Otherwise, fallback to conversion
  // on the CPU.
-  ConvertTexture1D(reinterpret_cast<uint8_t*>(alloc->host_ptr), src);
+  VkBufferImageCopy copy_region;
-  staging_buffer_.Flush(alloc);
+  if (!ConvertTexture(reinterpret_cast<uint8_t*>(alloc->host_ptr), &copy_region,
                      src)) {
    XELOGW("Failed to convert texture");
    return false;
  }
  // Transition the texture into a transfer destination layout.
  VkImageMemoryBarrier barrier;
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.pNext = nullptr;
  barrier.srcAccessMask = 0;
  // TODO(gibbed): is this correct? 1D+cube had VK_ACCESS_HOST_WRITE_BIT, but
  // not 2D.
  barrier.dstAccessMask =
      VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT;
  barrier.oldLayout = dest->image_layout;
@ -1116,85 +1161,6 @@ bool TextureCache::UploadTexture1D(VkCommandBuffer command_buffer,
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = dest->image;
  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
                       nullptr, 1, &barrier);
  // Now move the converted texture into the destination.
  VkBufferImageCopy copy_region;
  copy_region.bufferOffset = alloc->offset;
  copy_region.bufferRowLength = src.size_1d.output_width;
  copy_region.bufferImageHeight = 1;
  copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
  copy_region.imageOffset = {0, 0, 0};
  copy_region.imageExtent = {src.size_1d.output_width, 1, 1};
  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
                         &copy_region);
  // Now transition the texture into a shader readonly source.
  barrier.srcAccessMask = barrier.dstAccessMask;
  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
  barrier.oldLayout = barrier.newLayout;
  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
                       nullptr, 1, &barrier);
  dest->image_layout = barrier.newLayout;
  return true;
 }
 bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer,
                                   VkFence completion_fence, Texture* dest,
                                   const TextureInfo& src) {
 #if FINE_GRAINED_DRAW_SCOPES
  SCOPE_profile_cpu_f("gpu");
 #endif  // FINE_GRAINED_DRAW_SCOPES
  assert_true(src.dimension == Dimension::k2D);
  size_t unpack_length = src.output_length;
  if (!staging_buffer_.CanAcquire(unpack_length)) {
    // Need to have unique memory for every upload for at least one frame. If we
    // run out of memory, we need to flush all queued upload commands to the
    // GPU.
    FlushPendingCommands(command_buffer, completion_fence);
    // Uploads have been flushed. Continue.
    if (!staging_buffer_.CanAcquire(unpack_length)) {
      // The staging buffer isn't big enough to hold this texture.
      XELOGE(
          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
          unpack_length);
      assert_always();
      return false;
    }
  }
  // Grab some temporary memory for staging.
  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
  assert_not_null(alloc);
  // Upload texture into GPU memory.
  // TODO: If the GPU supports it, we can submit a compute batch to convert the
  // texture and copy it to its destination. Otherwise, fallback to conversion
  // on the CPU.
  ConvertTexture2D(reinterpret_cast<uint8_t*>(alloc->host_ptr), src);
  staging_buffer_.Flush(alloc);
  // Transition the texture into a transfer destination layout.
  VkImageMemoryBarrier barrier;
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.pNext = nullptr;
  barrier.srcAccessMask = 0;
  barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
  barrier.oldLayout = dest->image_layout;
  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = dest->image;
  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
  if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT ||
      dest->format == VK_FORMAT_D24_UNORM_S8_UINT ||
      dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
@ -1207,91 +1173,9 @@ bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer,
                       nullptr, 1, &barrier);
  // Now move the converted texture into the destination.
  VkBufferImageCopy copy_region;
  copy_region.bufferOffset = alloc->offset;
  copy_region.bufferRowLength = src.size_2d.output_width;
  copy_region.bufferImageHeight = src.size_2d.output_height;
  copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
  copy_region.imageOffset = {0, 0, 0};
  copy_region.imageExtent = {src.size_2d.output_width,
                             src.size_2d.output_height, 1};
  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
                         &copy_region);
  // Now transition the texture into a shader readonly source.
  barrier.srcAccessMask = barrier.dstAccessMask;
  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
  barrier.oldLayout = barrier.newLayout;
  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
                       nullptr, 1, &barrier);
  dest->image_layout = barrier.newLayout;
  return true;
 }
 bool TextureCache::UploadTextureCube(VkCommandBuffer command_buffer,
                                     VkFence completion_fence, Texture* dest,
                                     const TextureInfo& src) {
  assert_true(src.dimension == Dimension::kCube);
  size_t unpack_length = src.output_length;
  if (!staging_buffer_.CanAcquire(unpack_length)) {
    // Need to have unique memory for every upload for at least one frame. If we
    // run out of memory, we need to flush all queued upload commands to the
    // GPU.
    FlushPendingCommands(command_buffer, completion_fence);
    // Uploads have been flushed. Continue.
    if (!staging_buffer_.CanAcquire(unpack_length)) {
      // The staging buffer isn't big enough to hold this texture.
      XELOGE(
          "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)",
          unpack_length);
      assert_always();
      return false;
    }
  }
  // Grab some temporary memory for staging.
  auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence);
  assert_not_null(alloc);
  // Upload texture into GPU memory.
  // TODO: If the GPU supports it, we can submit a compute batch to convert the
  // texture and copy it to its destination. Otherwise, fallback to conversion
  // on the CPU.
  ConvertTextureCube(reinterpret_cast<uint8_t*>(alloc->host_ptr), src);
  staging_buffer_.Flush(alloc);
  // Transition the texture into a transfer destination layout.
  VkImageMemoryBarrier barrier;
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.pNext = nullptr;
  barrier.srcAccessMask = 0;
  barrier.dstAccessMask =
      VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT;
  barrier.oldLayout = dest->image_layout;
  barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = dest->image;
  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
                       nullptr, 1, &barrier);
  // Now move the converted texture into the destination.
  VkBufferImageCopy copy_region;
  copy_region.bufferOffset = alloc->offset;
  copy_region.bufferRowLength = src.size_cube.output_width;
  copy_region.bufferImageHeight = src.size_cube.output_height;
  copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
  copy_region.imageOffset = {0, 0, 0};
  copy_region.imageExtent = {src.size_cube.output_width,
                             src.size_cube.output_height, 6};
  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
                         &copy_region);
--- a/src/xenia/gpu/vulkan/texture_cache.h
+++ b/src/xenia/gpu/vulkan/texture_cache.h
@ -142,22 +142,21 @@ class TextureCache {
  void FlushPendingCommands(VkCommandBuffer command_buffer,
                            VkFence completion_fence);
-  void ConvertTexture1D(uint8_t* dest, const TextureInfo& src);
+  bool ConvertTexture1D(uint8_t* dest, VkBufferImageCopy* copy_region,
-  void ConvertTexture2D(uint8_t* dest, const TextureInfo& src);
+                        const TextureInfo& src);
-  void ConvertTextureCube(uint8_t* dest, const TextureInfo& src);
+  bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region,
                        const TextureInfo& src);
  bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region,
                          const TextureInfo& src);
  bool ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region,
                      const TextureInfo& src);
  bool ComputeTextureStorage(size_t* output_length, const TextureInfo& src);
  // Queues commands to upload a texture from system memory, applying any
  // conversions necessary. This may flush the command buffer to the GPU if we
  // run out of staging memory.
-  bool UploadTexture1D(VkCommandBuffer command_buffer, VkFence completion_fence,
+  bool UploadTexture(VkCommandBuffer command_buffer, VkFence completion_fence,
-                       Texture* dest, const TextureInfo& src);
+                     Texture* dest, const TextureInfo& src);
  bool UploadTexture2D(VkCommandBuffer command_buffer, VkFence completion_fence,
                       Texture* dest, const TextureInfo& src);
  bool UploadTextureCube(VkCommandBuffer command_buffer,
                         VkFence completion_fence, Texture* dest,
                         const TextureInfo& src);
  void HashTextureBindings(XXH64_state_t* hash_state, uint32_t& fetch_mask,
                           const std::vector<Shader::TextureBinding>& bindings);