From 7e819a4ccb6d8ef71e901cc8810ca339af92c732 Mon Sep 17 00:00:00 2001 From: DrChat Date: Sun, 20 Aug 2017 18:37:02 -0500 Subject: [PATCH] Vulkan: Refactor CTX1 conversion --- src/xenia/gpu/vulkan/texture_cache.cc | 371 +++++++++++--------------- src/xenia/gpu/vulkan/texture_cache.h | 3 + 2 files changed, 160 insertions(+), 214 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 661971ce7..8cd0070fb 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -363,6 +363,12 @@ TextureCache::Texture* TextureCache::DemandResolveTexture( // No texture at this location. Make a new one. auto texture = AllocateTexture(texture_info, required_flags); + if (!texture) { + // Failed to allocate texture (out of memory?) + assert_always(); + XELOGE("Vulkan Texture Cache: Failed to allocate texture!"); + return nullptr; + } // Setup a debug name for the texture. device_->DbgSetObjectName( @@ -427,6 +433,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, if (!texture) { // Failed to allocate texture (out of memory?) assert_always(); + XELOGE("Vulkan Texture Cache: Failed to allocate texture!"); return nullptr; } @@ -843,112 +850,69 @@ void TextureCache::FlushPendingCommands(VkCommandBuffer command_buffer, vkBeginCommandBuffer(command_buffer, &begin_info); } +void TextureCache::ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch, + const uint8_t* src, Endian src_endianness) { + // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf + union { + uint8_t data[8]; + struct { + uint8_t r0, g0, r1, g1; + uint32_t xx; + }; + } block; + static_assert(sizeof(block) == 8, "CTX1 block mismatch"); + + const uint32_t bytes_per_block = 8; + TextureSwap(src_endianness, block.data, src, bytes_per_block); + + uint8_t cr[4] = { + block.r0, block.r1, + static_cast(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1), + static_cast(1.f / 3.f * block.r0 + 2.f / 3.f * block.r1)}; + uint8_t cg[4] = { + block.g0, block.g1, + static_cast(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1), + static_cast(1.f / 3.f * block.g0 + 2.f / 3.f * block.g1)}; + + for (uint32_t oy = 0; oy < 4; ++oy) { + for (uint32_t ox = 0; ox < 4; ++ox) { + uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3; + dest[(oy * dest_pitch) + (ox * 2) + 0] = cr[xx]; + dest[(oy * dest_pitch) + (ox * 2) + 1] = cg[xx]; + } + } +} + bool TextureCache::ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region, const TextureInfo& src) { void* host_address = memory_->TranslatePhysical(src.guest_address); - if (src.texture_format == TextureFormat::k_CTX1) { - if (!src.is_tiled) { - assert_always(); - } else { - // Untile image. - // We could do this in a shader to speed things up, as this is pretty - // slow. + if (!src.is_tiled) { + uint32_t offset_x, offset_y; + if (src.has_packed_mips && + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) { + uint32_t bytes_per_block = src.format_info()->block_width * + src.format_info()->block_height * + src.format_info()->bits_per_pixel / 8; - // TODO(benvanik): optimize this inner loop (or work by tiles). const uint8_t* src_mem = reinterpret_cast(host_address); - const uint32_t bytes_per_block = 8; - - // Tiled textures can be packed; get the offset into the packed texture. - uint32_t offset_x; - uint32_t offset_y; - TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); - auto log2_bpp = (bytes_per_block >> 2) + - ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); - - uint32_t output_pitch = src.size_2d.input_width * 2; - // Offset to the current row, in bytes. - uint32_t output_row_offset = 0; - for (uint32_t y = 0; y < src.size_2d.block_height; y++) { - auto input_row_offset = TextureInfo::TiledOffset2DOuter( - offset_y + y, src.size_2d.block_width, log2_bpp); - - // Go block-by-block on this row. - uint32_t output_offset = output_row_offset; - for (uint32_t x = 0; x < src.size_2d.block_width; - x++, output_offset += 8) { - auto input_offset = - TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, - log2_bpp, input_row_offset) >> - log2_bpp; - - // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf - union { - uint8_t data[8]; - struct { - uint8_t r0, g0, r1, g1; - uint32_t xx; - }; - } block; - static_assert(sizeof(block) == 8, "CTX1 block mismatch"); - - TextureSwap(src.endianness, block.data, - src_mem + input_offset * bytes_per_block, - bytes_per_block); - - uint8_t cr[4] = { - block.r0, block.r1, - static_cast(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1), - static_cast(1.f / 3.f * block.r0 + - 2.f / 3.f * block.r1)}; - uint8_t cg[4] = { - block.g0, block.g1, - static_cast(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1), - static_cast(1.f / 3.f * block.g0 + - 2.f / 3.f * block.g1)}; - - for (uint32_t oy = 0; oy < 4; ++oy) { - for (uint32_t ox = 0; ox < 4; ++ox) { - uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3; - dest[output_offset + (oy * output_pitch) + (ox * 2) + 0] = cr[xx]; - dest[output_offset + (oy * output_pitch) + (ox * 2) + 1] = cg[xx]; - } - } - } - output_row_offset += output_pitch * 4; + src_mem += offset_y * src.size_2d.input_pitch; + src_mem += offset_x * bytes_per_block; + for (uint32_t y = 0; + y < std::min(src.size_2d.block_height, src.size_2d.logical_height); + y++) { + TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch); + src_mem += src.size_2d.input_pitch; + dest += src.size_2d.input_pitch; } - -#if 0 - static int dds_counter = 0; - uint8_t dds_header[] = { - 0x44, 0x44, 0x53, 0x20, 0x7C, 0x00, 0x00, 0x00, 0x07, 0x10, 0x00, - 0x00, 0x58, 0x02, 0x00, 0x00, 0x20, 0x03, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, - 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, - 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - *((uint32_t*)(&dds_header[12])) = src.size_2d.input_height; - *((uint32_t*)(&dds_header[16])) = src.size_2d.input_width; - - char dds_name[512]; - sprintf(dds_name, "TEST_CTX1_%u.dds", ++dds_counter); - auto handle = fopen(dds_name, "wb"); - fwrite(dds_header, sizeof(dds_header), 1, handle); - uint8_t dummy[2] = {0, 0}; - for (uint32_t i = 0; - i < src.size_2d.input_width * src.size_2d.input_height * 2; i += 2) { - fwrite(&dest[i], 2, 1, handle); - fwrite(dummy, 2, 1, handle); - } - fclose(handle); -#endif - + copy_region->bufferRowLength = src.size_2d.input_width; + copy_region->bufferImageHeight = src.size_2d.input_height; + copy_region->imageExtent = {src.size_2d.logical_width, + src.size_2d.logical_height, 1}; + return true; + } else { + // Fast path copy entire image. + TextureSwap(src.endianness, dest, host_address, src.input_length); copy_region->bufferRowLength = src.size_2d.input_width; copy_region->bufferImageHeight = src.size_2d.input_height; copy_region->imageExtent = {src.size_2d.logical_width, @@ -956,87 +920,70 @@ bool TextureCache::ConvertTexture2D(uint8_t* dest, return true; } } else { - if (!src.is_tiled) { - uint32_t offset_x, offset_y; - if (src.has_packed_mips && - TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) { - uint32_t bytes_per_block = src.format_info()->block_width * - src.format_info()->block_height * - src.format_info()->bits_per_pixel / 8; + // Untile image. + // We could do this in a shader to speed things up, as this is pretty + // slow. - const uint8_t* src_mem = reinterpret_cast(host_address); - src_mem += offset_y * src.size_2d.input_pitch; - src_mem += offset_x * bytes_per_block; - for (uint32_t y = 0; - y < std::min(src.size_2d.block_height, src.size_2d.logical_height); - y++) { - TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch); - src_mem += src.size_2d.input_pitch; - dest += src.size_2d.input_pitch; - } - copy_region->bufferRowLength = src.size_2d.input_width; - copy_region->bufferImageHeight = src.size_2d.input_height; - copy_region->imageExtent = {src.size_2d.logical_width, - src.size_2d.logical_height, 1}; - return true; - } else { - // Fast path copy entire image. - TextureSwap(src.endianness, dest, host_address, src.input_length); - copy_region->bufferRowLength = src.size_2d.input_width; - copy_region->bufferImageHeight = src.size_2d.input_height; - copy_region->imageExtent = {src.size_2d.logical_width, - src.size_2d.logical_height, 1}; - return true; - } - } else { - // Untile image. - // We could do this in a shader to speed things up, as this is pretty - // slow. + // TODO(benvanik): optimize this inner loop (or work by tiles). + const uint8_t* src_mem = reinterpret_cast(host_address); + uint32_t bytes_per_block = src.format_info()->block_width * + src.format_info()->block_height * + src.format_info()->bits_per_pixel / 8; - // TODO(benvanik): optimize this inner loop (or work by tiles). - const uint8_t* src_mem = reinterpret_cast(host_address); - uint32_t bytes_per_block = src.format_info()->block_width * - src.format_info()->block_height * - src.format_info()->bits_per_pixel / 8; + uint32_t output_pitch = src.size_2d.input_width * + src.format_info()->block_width * + src.format_info()->bits_per_pixel / 8; - // Tiled textures can be packed; get the offset into the packed texture. - uint32_t offset_x; - uint32_t offset_y; - TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); - auto log2_bpp = (bytes_per_block >> 2) + - ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); + uint32_t output_row_height = 1; + if (src.texture_format == TextureFormat::k_CTX1) { + // TODO: Can we calculate this? + output_row_height = 4; + } - // Offset to the current row, in bytes. - uint32_t output_row_offset = 0; - for (uint32_t y = 0; y < src.size_2d.block_height; y++) { - auto input_row_offset = TextureInfo::TiledOffset2DOuter( - offset_y + y, src.size_2d.block_width, log2_bpp); + // Tiled textures can be packed; get the offset into the packed texture. + uint32_t offset_x; + uint32_t offset_y; + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); + auto log2_bpp = (bytes_per_block >> 2) + + ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); - // Go block-by-block on this row. - uint32_t output_offset = output_row_offset; - for (uint32_t x = 0; x < src.size_2d.block_width; x++) { - auto input_offset = - TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, - log2_bpp, input_row_offset) >> - log2_bpp; + // Offset to the current row, in bytes. + uint32_t output_row_offset = 0; + for (uint32_t y = 0; y < src.size_2d.block_height; y++) { + auto input_row_offset = TextureInfo::TiledOffset2DOuter( + offset_y + y, src.size_2d.block_width, log2_bpp); + // Go block-by-block on this row. + uint32_t output_offset = output_row_offset; + for (uint32_t x = 0; x < src.size_2d.block_width; x++) { + auto input_offset = TextureInfo::TiledOffset2DInner( + offset_x + x, offset_y + y, log2_bpp, input_row_offset); + input_offset >>= log2_bpp; + + if (src.texture_format == TextureFormat::k_CTX1) { + // Convert to R8G8. + ConvertTexelCTX1(&dest[output_offset], output_pitch, src_mem, + src.endianness); + } else { + // Generic swap to destination. TextureSwap(src.endianness, dest + output_offset, src_mem + input_offset * bytes_per_block, bytes_per_block); - - output_offset += bytes_per_block; } - output_row_offset += src.size_2d.input_pitch; + output_offset += bytes_per_block; } - copy_region->bufferRowLength = src.size_2d.input_width; - copy_region->bufferImageHeight = src.size_2d.input_height; - copy_region->imageExtent = {src.size_2d.logical_width, - src.size_2d.logical_height, 1}; - return true; + output_row_offset += output_pitch * output_row_height; } + + copy_region->bufferRowLength = src.size_2d.input_width; + copy_region->bufferImageHeight = src.size_2d.input_height; + copy_region->imageExtent = {src.size_2d.logical_width, + src.size_2d.logical_height, 1}; + return true; } + return false; } @@ -1044,60 +991,56 @@ bool TextureCache::ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region, const TextureInfo& src) { void* host_address = memory_->TranslatePhysical(src.guest_address); - if (src.texture_format == TextureFormat::k_CTX1) { - assert_always(); + if (!src.is_tiled) { + // Fast path copy entire image. + TextureSwap(src.endianness, dest, host_address, src.input_length); + copy_region->bufferRowLength = src.size_cube.input_width; + copy_region->bufferImageHeight = src.size_cube.input_height; + copy_region->imageExtent = {src.size_cube.logical_width, + src.size_cube.logical_height, 6}; + return true; } else { - if (!src.is_tiled) { - // Fast path copy entire image. - TextureSwap(src.endianness, dest, host_address, src.input_length); - copy_region->bufferRowLength = src.size_cube.input_width; - copy_region->bufferImageHeight = src.size_cube.input_height; - copy_region->imageExtent = {src.size_cube.logical_width, - src.size_cube.logical_height, 6}; - return true; - } else { - // TODO(benvanik): optimize this inner loop (or work by tiles). - const uint8_t* src_mem = reinterpret_cast(host_address); - uint32_t bytes_per_block = src.format_info()->block_width * - src.format_info()->block_height * - src.format_info()->bits_per_pixel / 8; - // Tiled textures can be packed; get the offset into the packed texture. - uint32_t offset_x; - uint32_t offset_y; - TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); - auto bpp = (bytes_per_block >> 2) + - ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); - for (int face = 0; face < 6; ++face) { - for (uint32_t y = 0, output_base_offset = 0; - y < src.size_cube.block_height; - y++, output_base_offset += src.size_cube.input_pitch) { - auto input_base_offset = TextureInfo::TiledOffset2DOuter( - offset_y + y, - (src.size_cube.input_width / src.format_info()->block_width), - bpp); - for (uint32_t x = 0, output_offset = output_base_offset; - x < src.size_cube.block_width; - x++, output_offset += bytes_per_block) { - auto input_offset = - TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp, - input_base_offset) >> - bpp; - TextureSwap(src.endianness, dest + output_offset, - src_mem + input_offset * bytes_per_block, - bytes_per_block); - } + // TODO(benvanik): optimize this inner loop (or work by tiles). + const uint8_t* src_mem = reinterpret_cast(host_address); + uint32_t bytes_per_block = src.format_info()->block_width * + src.format_info()->block_height * + src.format_info()->bits_per_pixel / 8; + // Tiled textures can be packed; get the offset into the packed texture. + uint32_t offset_x; + uint32_t offset_y; + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); + auto bpp = (bytes_per_block >> 2) + + ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); + for (int face = 0; face < 6; ++face) { + for (uint32_t y = 0, output_base_offset = 0; + y < src.size_cube.block_height; + y++, output_base_offset += src.size_cube.input_pitch) { + auto input_base_offset = TextureInfo::TiledOffset2DOuter( + offset_y + y, + (src.size_cube.input_width / src.format_info()->block_width), bpp); + for (uint32_t x = 0, output_offset = output_base_offset; + x < src.size_cube.block_width; + x++, output_offset += bytes_per_block) { + auto input_offset = + TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp, + input_base_offset) >> + bpp; + TextureSwap(src.endianness, dest + output_offset, + src_mem + input_offset * bytes_per_block, + bytes_per_block); } - src_mem += src.size_cube.input_face_length; - dest += src.size_cube.input_face_length; } - - copy_region->bufferRowLength = src.size_cube.input_width; - copy_region->bufferImageHeight = src.size_cube.input_height; - copy_region->imageExtent = {src.size_cube.logical_width, - src.size_cube.logical_height, 6}; - return true; + src_mem += src.size_cube.input_face_length; + dest += src.size_cube.input_face_length; } + + copy_region->bufferRowLength = src.size_cube.input_width; + copy_region->bufferImageHeight = src.size_cube.input_height; + copy_region->imageExtent = {src.size_cube.logical_width, + src.size_cube.logical_height, 6}; + return true; } + return false; } diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 75ce5cb74..f2c5c9839 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -140,6 +140,9 @@ class TextureCache { void FlushPendingCommands(VkCommandBuffer command_buffer, VkFence completion_fence); + static void ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch, + const uint8_t* src, Endian src_endianness); + bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region, const TextureInfo& src); bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region,