From 7e819a4ccb6d8ef71e901cc8810ca339af92c732 Mon Sep 17 00:00:00 2001
From: DrChat <arkolbed@gmail.com>
Date: Sun, 20 Aug 2017 18:37:02 -0500
Subject: [PATCH] Vulkan: Refactor CTX1 conversion

---
 src/xenia/gpu/vulkan/texture_cache.cc | 371 +++++++++++---------------
 src/xenia/gpu/vulkan/texture_cache.h  |   3 +
 2 files changed, 160 insertions(+), 214 deletions(-)
diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc
index 661971ce7..8cd0070fb 100644
--- a/src/xenia/gpu/vulkan/texture_cache.cc
+++ b/src/xenia/gpu/vulkan/texture_cache.cc
@@ -363,6 +363,12 @@ TextureCache::Texture* TextureCache::DemandResolveTexture(
 
   // No texture at this location. Make a new one.
   auto texture = AllocateTexture(texture_info, required_flags);
+  if (!texture) {
+    // Failed to allocate texture (out of memory?)
+    assert_always();
+    XELOGE("Vulkan Texture Cache: Failed to allocate texture!");
+    return nullptr;
+  }
 
   // Setup a debug name for the texture.
   device_->DbgSetObjectName(
@@ -427,6 +433,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info,
   if (!texture) {
     // Failed to allocate texture (out of memory?)
     assert_always();
+    XELOGE("Vulkan Texture Cache: Failed to allocate texture!");
     return nullptr;
   }
 
@@ -843,112 +850,69 @@ void TextureCache::FlushPendingCommands(VkCommandBuffer command_buffer,
   vkBeginCommandBuffer(command_buffer, &begin_info);
 }
 
+void TextureCache::ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
+                                    const uint8_t* src, Endian src_endianness) {
+  // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
+  union {
+    uint8_t data[8];  // Raw block bytes; written by TextureSwap below.
+    struct {
+      uint8_t r0, g0, r1, g1;  // Red/green endpoint pairs 0 and 1.
+      uint32_t xx;  // Sixteen 2-bit palette indices, texel 0 in the low bits.
+    };
+  } block;
+  static_assert(sizeof(block) == 8, "CTX1 block mismatch");
+  // Decodes one 8-byte block at src into a 4x4 tile of 2-byte texels at dest.
+  const uint32_t bytes_per_block = 8;
+  TextureSwap(src_endianness, block.data, src, bytes_per_block);
+  // Palettes: endpoint 0, endpoint 1, then 2:1 and 1:2 blends (truncated).
+  uint8_t cr[4] = {
+      block.r0, block.r1,
+      static_cast<uint8_t>(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1),
+      static_cast<uint8_t>(1.f / 3.f * block.r0 + 2.f / 3.f * block.r1)};
+  uint8_t cg[4] = {
+      block.g0, block.g1,
+      static_cast<uint8_t>(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1),
+      static_cast<uint8_t>(1.f / 3.f * block.g0 + 2.f / 3.f * block.g1)};
+  // Texel (ox, oy) uses palette entry xx; output rows are dest_pitch bytes apart.
+  for (uint32_t oy = 0; oy < 4; ++oy) {
+    for (uint32_t ox = 0; ox < 4; ++ox) {
+      uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3;
+      dest[(oy * dest_pitch) + (ox * 2) + 0] = cr[xx];  // Red byte.
+      dest[(oy * dest_pitch) + (ox * 2) + 1] = cg[xx];  // Green byte.
+    }
+  }
+}
+
 bool TextureCache::ConvertTexture2D(uint8_t* dest,
                                     VkBufferImageCopy* copy_region,
                                     const TextureInfo& src) {
   void* host_address = memory_->TranslatePhysical(src.guest_address);
-  if (src.texture_format == TextureFormat::k_CTX1) {
-    if (!src.is_tiled) {
-      assert_always();
-    } else {
-      // Untile image.
-      // We could do this in a shader to speed things up, as this is pretty
-      // slow.
+  if (!src.is_tiled) {
+    uint32_t offset_x, offset_y;
+    if (src.has_packed_mips &&
+        TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) {
+      uint32_t bytes_per_block = src.format_info()->block_width *
+                                 src.format_info()->block_height *
+                                 src.format_info()->bits_per_pixel / 8;
 
-      // TODO(benvanik): optimize this inner loop (or work by tiles).
       const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-      const uint32_t bytes_per_block = 8;
-
-      // Tiled textures can be packed; get the offset into the packed texture.
-      uint32_t offset_x;
-      uint32_t offset_y;
-      TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
-      auto log2_bpp = (bytes_per_block >> 2) +
-                      ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
-
-      uint32_t output_pitch = src.size_2d.input_width * 2;
-      // Offset to the current row, in bytes.
-      uint32_t output_row_offset = 0;
-      for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
-        auto input_row_offset = TextureInfo::TiledOffset2DOuter(
-            offset_y + y, src.size_2d.block_width, log2_bpp);
-
-        // Go block-by-block on this row.
-        uint32_t output_offset = output_row_offset;
-        for (uint32_t x = 0; x < src.size_2d.block_width;
-             x++, output_offset += 8) {
-          auto input_offset =
-              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y,
-                                              log2_bpp, input_row_offset) >>
-              log2_bpp;
-
-          // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
-          union {
-            uint8_t data[8];
-            struct {
-              uint8_t r0, g0, r1, g1;
-              uint32_t xx;
-            };
-          } block;
-          static_assert(sizeof(block) == 8, "CTX1 block mismatch");
-
-          TextureSwap(src.endianness, block.data,
-                      src_mem + input_offset * bytes_per_block,
-                      bytes_per_block);
-
-          uint8_t cr[4] = {
-              block.r0, block.r1,
-              static_cast<uint8_t>(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1),
-              static_cast<uint8_t>(1.f / 3.f * block.r0 +
-                                   2.f / 3.f * block.r1)};
-          uint8_t cg[4] = {
-              block.g0, block.g1,
-              static_cast<uint8_t>(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1),
-              static_cast<uint8_t>(1.f / 3.f * block.g0 +
-                                   2.f / 3.f * block.g1)};
-
-          for (uint32_t oy = 0; oy < 4; ++oy) {
-            for (uint32_t ox = 0; ox < 4; ++ox) {
-              uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3;
-              dest[output_offset + (oy * output_pitch) + (ox * 2) + 0] = cr[xx];
-              dest[output_offset + (oy * output_pitch) + (ox * 2) + 1] = cg[xx];
-            }
-          }
-        }
-        output_row_offset += output_pitch * 4;
+      src_mem += offset_y * src.size_2d.input_pitch;
+      src_mem += offset_x * bytes_per_block;
+      for (uint32_t y = 0;
+           y < std::min(src.size_2d.block_height, src.size_2d.logical_height);
+           y++) {
+        TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch);
+        src_mem += src.size_2d.input_pitch;
+        dest += src.size_2d.input_pitch;
       }
-
-#if 0
-      static int dds_counter = 0;
-      uint8_t dds_header[] = {
-          0x44, 0x44, 0x53, 0x20, 0x7C, 0x00, 0x00, 0x00, 0x07, 0x10, 0x00,
-          0x00, 0x58, 0x02, 0x00, 0x00, 0x20, 0x03, 0x00, 0x00, 0x00, 0x00,
-          0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20,
-          0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-          0x20, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00,
-          0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
-          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-      *((uint32_t*)(&dds_header[12])) = src.size_2d.input_height;
-      *((uint32_t*)(&dds_header[16])) = src.size_2d.input_width;
-
-      char dds_name[512];
-      sprintf(dds_name, "TEST_CTX1_%u.dds", ++dds_counter);
-      auto handle = fopen(dds_name, "wb");
-      fwrite(dds_header, sizeof(dds_header), 1, handle);
-      uint8_t dummy[2] = {0, 0};
-      for (uint32_t i = 0;
-           i < src.size_2d.input_width * src.size_2d.input_height * 2; i += 2) {
-        fwrite(&dest[i], 2, 1, handle);
-        fwrite(dummy, 2, 1, handle);
-      }
-      fclose(handle);
-#endif
-
+      copy_region->bufferRowLength = src.size_2d.input_width;
+      copy_region->bufferImageHeight = src.size_2d.input_height;
+      copy_region->imageExtent = {src.size_2d.logical_width,
+                                  src.size_2d.logical_height, 1};
+      return true;
+    } else {
+      // Fast path copy entire image.
+      TextureSwap(src.endianness, dest, host_address, src.input_length);
       copy_region->bufferRowLength = src.size_2d.input_width;
       copy_region->bufferImageHeight = src.size_2d.input_height;
       copy_region->imageExtent = {src.size_2d.logical_width,
@@ -956,87 +920,70 @@ bool TextureCache::ConvertTexture2D(uint8_t* dest,
       return true;
     }
   } else {
-    if (!src.is_tiled) {
-      uint32_t offset_x, offset_y;
-      if (src.has_packed_mips &&
-          TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) {
-        uint32_t bytes_per_block = src.format_info()->block_width *
-                                   src.format_info()->block_height *
-                                   src.format_info()->bits_per_pixel / 8;
+    // Untile image.
+    // We could do this in a shader to speed things up, as this is pretty
+    // slow.
 
-        const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-        src_mem += offset_y * src.size_2d.input_pitch;
-        src_mem += offset_x * bytes_per_block;
-        for (uint32_t y = 0;
-             y < std::min(src.size_2d.block_height, src.size_2d.logical_height);
-             y++) {
-          TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch);
-          src_mem += src.size_2d.input_pitch;
-          dest += src.size_2d.input_pitch;
-        }
-        copy_region->bufferRowLength = src.size_2d.input_width;
-        copy_region->bufferImageHeight = src.size_2d.input_height;
-        copy_region->imageExtent = {src.size_2d.logical_width,
-                                    src.size_2d.logical_height, 1};
-        return true;
-      } else {
-        // Fast path copy entire image.
-        TextureSwap(src.endianness, dest, host_address, src.input_length);
-        copy_region->bufferRowLength = src.size_2d.input_width;
-        copy_region->bufferImageHeight = src.size_2d.input_height;
-        copy_region->imageExtent = {src.size_2d.logical_width,
-                                    src.size_2d.logical_height, 1};
-        return true;
-      }
-    } else {
-      // Untile image.
-      // We could do this in a shader to speed things up, as this is pretty
-      // slow.
+    // TODO(benvanik): optimize this inner loop (or work by tiles).
+    const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+    uint32_t bytes_per_block = src.format_info()->block_width *
+                               src.format_info()->block_height *
+                               src.format_info()->bits_per_pixel / 8;
 
-      // TODO(benvanik): optimize this inner loop (or work by tiles).
-      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-      uint32_t bytes_per_block = src.format_info()->block_width *
-                                 src.format_info()->block_height *
-                                 src.format_info()->bits_per_pixel / 8;
+    uint32_t output_pitch = src.size_2d.input_width *
+                            src.format_info()->block_width *
+                            src.format_info()->bits_per_pixel / 8;
 
-      // Tiled textures can be packed; get the offset into the packed texture.
-      uint32_t offset_x;
-      uint32_t offset_y;
-      TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
-      auto log2_bpp = (bytes_per_block >> 2) +
-                      ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
+    uint32_t output_row_height = 1;
+    if (src.texture_format == TextureFormat::k_CTX1) {
+      // TODO: Can we calculate this?
+      output_row_height = 4;
+    }
 
-      // Offset to the current row, in bytes.
-      uint32_t output_row_offset = 0;
-      for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
-        auto input_row_offset = TextureInfo::TiledOffset2DOuter(
-            offset_y + y, src.size_2d.block_width, log2_bpp);
+    // Tiled textures can be packed; get the offset into the packed texture.
+    uint32_t offset_x;
+    uint32_t offset_y;
+    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
+    auto log2_bpp = (bytes_per_block >> 2) +
+                    ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
 
-        // Go block-by-block on this row.
-        uint32_t output_offset = output_row_offset;
-        for (uint32_t x = 0; x < src.size_2d.block_width; x++) {
-          auto input_offset =
-              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y,
-                                              log2_bpp, input_row_offset) >>
-              log2_bpp;
+    // Offset to the current row, in bytes.
+    uint32_t output_row_offset = 0;
+    for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
+      auto input_row_offset = TextureInfo::TiledOffset2DOuter(
+          offset_y + y, src.size_2d.block_width, log2_bpp);
 
+      // Go block-by-block on this row.
+      uint32_t output_offset = output_row_offset;
+      for (uint32_t x = 0; x < src.size_2d.block_width; x++) {
+        auto input_offset = TextureInfo::TiledOffset2DInner(
+            offset_x + x, offset_y + y, log2_bpp, input_row_offset);
+        input_offset >>= log2_bpp;
+
+        if (src.texture_format == TextureFormat::k_CTX1) {
+          ConvertTexelCTX1(&dest[output_offset], output_pitch,
+                           src_mem + input_offset * bytes_per_block,
+                           src.endianness);  // Decode this block to R8G8.
+        } else {
+          // Generic swap to destination.
           TextureSwap(src.endianness, dest + output_offset,
                       src_mem + input_offset * bytes_per_block,
                       bytes_per_block);
-
-          output_offset += bytes_per_block;
         }
 
-        output_row_offset += src.size_2d.input_pitch;
+        output_offset += bytes_per_block;
       }
 
-      copy_region->bufferRowLength = src.size_2d.input_width;
-      copy_region->bufferImageHeight = src.size_2d.input_height;
-      copy_region->imageExtent = {src.size_2d.logical_width,
-                                  src.size_2d.logical_height, 1};
-      return true;
+      output_row_offset += output_pitch * output_row_height;
     }
+
+    copy_region->bufferRowLength = src.size_2d.input_width;
+    copy_region->bufferImageHeight = src.size_2d.input_height;
+    copy_region->imageExtent = {src.size_2d.logical_width,
+                                src.size_2d.logical_height, 1};
+    return true;
   }
+
   return false;
 }
 
@@ -1044,60 +991,56 @@ bool TextureCache::ConvertTextureCube(uint8_t* dest,
                                       VkBufferImageCopy* copy_region,
                                       const TextureInfo& src) {
   void* host_address = memory_->TranslatePhysical(src.guest_address);
-  if (src.texture_format == TextureFormat::k_CTX1) {
-    assert_always();
+  if (!src.is_tiled) {
+    // Fast path copy entire image.
+    TextureSwap(src.endianness, dest, host_address, src.input_length);
+    copy_region->bufferRowLength = src.size_cube.input_width;
+    copy_region->bufferImageHeight = src.size_cube.input_height;
+    copy_region->imageExtent = {src.size_cube.logical_width,
+                                src.size_cube.logical_height, 6};
+    return true;
   } else {
-    if (!src.is_tiled) {
-      // Fast path copy entire image.
-      TextureSwap(src.endianness, dest, host_address, src.input_length);
-      copy_region->bufferRowLength = src.size_cube.input_width;
-      copy_region->bufferImageHeight = src.size_cube.input_height;
-      copy_region->imageExtent = {src.size_cube.logical_width,
-                                  src.size_cube.logical_height, 6};
-      return true;
-    } else {
-      // TODO(benvanik): optimize this inner loop (or work by tiles).
-      const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
-      uint32_t bytes_per_block = src.format_info()->block_width *
-                                 src.format_info()->block_height *
-                                 src.format_info()->bits_per_pixel / 8;
-      // Tiled textures can be packed; get the offset into the packed texture.
-      uint32_t offset_x;
-      uint32_t offset_y;
-      TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
-      auto bpp = (bytes_per_block >> 2) +
-                 ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
-      for (int face = 0; face < 6; ++face) {
-        for (uint32_t y = 0, output_base_offset = 0;
-             y < src.size_cube.block_height;
-             y++, output_base_offset += src.size_cube.input_pitch) {
-          auto input_base_offset = TextureInfo::TiledOffset2DOuter(
-              offset_y + y,
-              (src.size_cube.input_width / src.format_info()->block_width),
-              bpp);
-          for (uint32_t x = 0, output_offset = output_base_offset;
-               x < src.size_cube.block_width;
-               x++, output_offset += bytes_per_block) {
-            auto input_offset =
-                TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
-                                                input_base_offset) >>
-                bpp;
-            TextureSwap(src.endianness, dest + output_offset,
-                        src_mem + input_offset * bytes_per_block,
-                        bytes_per_block);
-          }
+    // TODO(benvanik): optimize this inner loop (or work by tiles).
+    const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
+    uint32_t bytes_per_block = src.format_info()->block_width *
+                               src.format_info()->block_height *
+                               src.format_info()->bits_per_pixel / 8;
+    // Tiled textures can be packed; get the offset into the packed texture.
+    uint32_t offset_x;
+    uint32_t offset_y;
+    TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
+    auto bpp = (bytes_per_block >> 2) +
+               ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
+    for (int face = 0; face < 6; ++face) {
+      for (uint32_t y = 0, output_base_offset = 0;
+           y < src.size_cube.block_height;
+           y++, output_base_offset += src.size_cube.input_pitch) {
+        auto input_base_offset = TextureInfo::TiledOffset2DOuter(
+            offset_y + y,
+            (src.size_cube.input_width / src.format_info()->block_width), bpp);
+        for (uint32_t x = 0, output_offset = output_base_offset;
+             x < src.size_cube.block_width;
+             x++, output_offset += bytes_per_block) {
+          auto input_offset =
+              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
+                                              input_base_offset) >>
+              bpp;
+          TextureSwap(src.endianness, dest + output_offset,
+                      src_mem + input_offset * bytes_per_block,
+                      bytes_per_block);
         }
-        src_mem += src.size_cube.input_face_length;
-        dest += src.size_cube.input_face_length;
       }
-
-      copy_region->bufferRowLength = src.size_cube.input_width;
-      copy_region->bufferImageHeight = src.size_cube.input_height;
-      copy_region->imageExtent = {src.size_cube.logical_width,
-                                  src.size_cube.logical_height, 6};
-      return true;
+      src_mem += src.size_cube.input_face_length;
+      dest += src.size_cube.input_face_length;
     }
+
+    copy_region->bufferRowLength = src.size_cube.input_width;
+    copy_region->bufferImageHeight = src.size_cube.input_height;
+    copy_region->imageExtent = {src.size_cube.logical_width,
+                                src.size_cube.logical_height, 6};
+    return true;
   }
+
   return false;
 }
 
diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h
index 75ce5cb74..f2c5c9839 100644
--- a/src/xenia/gpu/vulkan/texture_cache.h
+++ b/src/xenia/gpu/vulkan/texture_cache.h
@@ -140,6 +140,9 @@ class TextureCache {
   void FlushPendingCommands(VkCommandBuffer command_buffer,
                             VkFence completion_fence);
 
+  static void ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
+                               const uint8_t* src, Endian src_endianness);
+
   bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region,
                         const TextureInfo& src);
   bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region,