Vulkan: Refactor CTX1 conversion

This commit is contained in:
DrChat 2017-08-20 18:37:02 -05:00
parent 1eac03a11c
commit 7e819a4ccb
2 changed files with 160 additions and 214 deletions

View File

@ -363,6 +363,12 @@ TextureCache::Texture* TextureCache::DemandResolveTexture(
// No texture at this location. Make a new one. // No texture at this location. Make a new one.
auto texture = AllocateTexture(texture_info, required_flags); auto texture = AllocateTexture(texture_info, required_flags);
if (!texture) {
// Failed to allocate texture (out of memory?)
assert_always();
XELOGE("Vulkan Texture Cache: Failed to allocate texture!");
return nullptr;
}
// Setup a debug name for the texture. // Setup a debug name for the texture.
device_->DbgSetObjectName( device_->DbgSetObjectName(
@ -427,6 +433,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info,
if (!texture) { if (!texture) {
// Failed to allocate texture (out of memory?) // Failed to allocate texture (out of memory?)
assert_always(); assert_always();
XELOGE("Vulkan Texture Cache: Failed to allocate texture!");
return nullptr; return nullptr;
} }
@ -843,112 +850,69 @@ void TextureCache::FlushPendingCommands(VkCommandBuffer command_buffer,
vkBeginCommandBuffer(command_buffer, &begin_info); vkBeginCommandBuffer(command_buffer, &begin_info);
} }
// Expands a single 8-byte CTX1 block read from |src| into a 4x4 patch of
// two-byte texels written at |dest|. Output rows are |dest_pitch| bytes apart.
// The block bytes go through TextureSwap (using |src_endianness|) before they
// are decoded.
// Format reference:
// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
void TextureCache::ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
                                    const uint8_t* src, Endian src_endianness) {
  // Raw block bytes overlaid with the two endpoint pairs and the 2-bit
  // per-texel selectors.
  union {
    uint8_t data[8];
    struct {
      uint8_t r0, g0, r1, g1;
      uint32_t xx;
    };
  } packed;
  static_assert(sizeof(packed) == 8, "CTX1 block mismatch");

  const uint32_t block_size_bytes = 8;
  TextureSwap(src_endianness, packed.data, src, block_size_bytes);

  // Weighted mix of two endpoint values, truncated to a byte.
  const auto blend = [](float weight_a, uint8_t a, float weight_b, uint8_t b) {
    return static_cast<uint8_t>(weight_a * a + weight_b * b);
  };
  const float two_thirds = 2.f / 3.f;
  const float one_third = 1.f / 3.f;

  // Four-entry palettes per channel: both endpoints plus two in-between
  // values.
  const uint8_t first_channel[4] = {
      packed.r0, packed.r1,
      blend(two_thirds, packed.r0, one_third, packed.r1),
      blend(one_third, packed.r0, two_thirds, packed.r1)};
  const uint8_t second_channel[4] = {
      packed.g0, packed.g1,
      blend(two_thirds, packed.g0, one_third, packed.g1),
      blend(one_third, packed.g0, two_thirds, packed.g1)};

  // Selectors are consumed two bits at a time, lowest bits first, walking the
  // patch row by row.
  uint32_t remaining_selectors = packed.xx;
  for (uint32_t row = 0; row < 4; ++row) {
    uint8_t* out_row = dest + (row * dest_pitch);
    for (uint32_t col = 0; col < 4; ++col) {
      const uint32_t palette_index = remaining_selectors & 3;
      remaining_selectors >>= 2;
      out_row[(col * 2) + 0] = first_channel[palette_index];
      out_row[(col * 2) + 1] = second_channel[palette_index];
    }
  }
}
// NOTE: this span is a two-column diff rendering; each line carries the
// pre-change text followed by the post-change text of ConvertTexture2D.
// Post-change flow (right-hand column): the guest address is translated to a
// host pointer; non-tiled sources are passed through TextureSwap into dest
// (row by row when has_packed_mips is set and GetPackedTileOffset returns
// true, otherwise in a single call over src.input_length); tiled sources are
// untiled block by block, with k_CTX1 blocks routed through ConvertTexelCTX1.
// Each of those paths fills copy_region from src.size_2d and returns true.
bool TextureCache::ConvertTexture2D(uint8_t* dest, bool TextureCache::ConvertTexture2D(uint8_t* dest,
VkBufferImageCopy* copy_region, VkBufferImageCopy* copy_region,
const TextureInfo& src) { const TextureInfo& src) {
void* host_address = memory_->TranslatePhysical(src.guest_address); void* host_address = memory_->TranslatePhysical(src.guest_address);
if (src.texture_format == TextureFormat::k_CTX1) { if (!src.is_tiled) {
if (!src.is_tiled) { uint32_t offset_x, offset_y;
assert_always(); if (src.has_packed_mips &&
} else { TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) {
// Untile image. uint32_t bytes_per_block = src.format_info()->block_width *
// We could do this in a shader to speed things up, as this is pretty src.format_info()->block_height *
// slow. src.format_info()->bits_per_pixel / 8;
// TODO(benvanik): optimize this inner loop (or work by tiles).
const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address); const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
const uint32_t bytes_per_block = 8; src_mem += offset_y * src.size_2d.input_pitch;
src_mem += offset_x * bytes_per_block;
// Tiled textures can be packed; get the offset into the packed texture. for (uint32_t y = 0;
uint32_t offset_x; y < std::min(src.size_2d.block_height, src.size_2d.logical_height);
uint32_t offset_y; y++) {
TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch);
auto log2_bpp = (bytes_per_block >> 2) + src_mem += src.size_2d.input_pitch;
((bytes_per_block >> 1) >> (bytes_per_block >> 2)); dest += src.size_2d.input_pitch;
uint32_t output_pitch = src.size_2d.input_width * 2;
// Offset to the current row, in bytes.
uint32_t output_row_offset = 0;
for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
auto input_row_offset = TextureInfo::TiledOffset2DOuter(
offset_y + y, src.size_2d.block_width, log2_bpp);
// Go block-by-block on this row.
uint32_t output_offset = output_row_offset;
for (uint32_t x = 0; x < src.size_2d.block_width;
x++, output_offset += 8) {
auto input_offset =
TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y,
log2_bpp, input_row_offset) >>
log2_bpp;
// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
union {
uint8_t data[8];
struct {
uint8_t r0, g0, r1, g1;
uint32_t xx;
};
} block;
static_assert(sizeof(block) == 8, "CTX1 block mismatch");
TextureSwap(src.endianness, block.data,
src_mem + input_offset * bytes_per_block,
bytes_per_block);
uint8_t cr[4] = {
block.r0, block.r1,
static_cast<uint8_t>(2.f / 3.f * block.r0 + 1.f / 3.f * block.r1),
static_cast<uint8_t>(1.f / 3.f * block.r0 +
2.f / 3.f * block.r1)};
uint8_t cg[4] = {
block.g0, block.g1,
static_cast<uint8_t>(2.f / 3.f * block.g0 + 1.f / 3.f * block.g1),
static_cast<uint8_t>(1.f / 3.f * block.g0 +
2.f / 3.f * block.g1)};
for (uint32_t oy = 0; oy < 4; ++oy) {
for (uint32_t ox = 0; ox < 4; ++ox) {
uint8_t xx = (block.xx >> (((ox + (oy * 4)) * 2))) & 3;
dest[output_offset + (oy * output_pitch) + (ox * 2) + 0] = cr[xx];
dest[output_offset + (oy * output_pitch) + (ox * 2) + 1] = cg[xx];
}
}
}
output_row_offset += output_pitch * 4;
} }
copy_region->bufferRowLength = src.size_2d.input_width;
#if 0 copy_region->bufferImageHeight = src.size_2d.input_height;
static int dds_counter = 0; copy_region->imageExtent = {src.size_2d.logical_width,
uint8_t dds_header[] = { src.size_2d.logical_height, 1};
0x44, 0x44, 0x53, 0x20, 0x7C, 0x00, 0x00, 0x00, 0x07, 0x10, 0x00, return true;
0x00, 0x58, 0x02, 0x00, 0x00, 0x20, 0x03, 0x00, 0x00, 0x00, 0x00, } else {
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Fast path copy entire image.
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, TextureSwap(src.endianness, dest, host_address, src.input_length);
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20,
0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x20, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00,
0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
*((uint32_t*)(&dds_header[12])) = src.size_2d.input_height;
*((uint32_t*)(&dds_header[16])) = src.size_2d.input_width;
char dds_name[512];
sprintf(dds_name, "TEST_CTX1_%u.dds", ++dds_counter);
auto handle = fopen(dds_name, "wb");
fwrite(dds_header, sizeof(dds_header), 1, handle);
uint8_t dummy[2] = {0, 0};
for (uint32_t i = 0;
i < src.size_2d.input_width * src.size_2d.input_height * 2; i += 2) {
fwrite(&dest[i], 2, 1, handle);
fwrite(dummy, 2, 1, handle);
}
fclose(handle);
#endif
copy_region->bufferRowLength = src.size_2d.input_width; copy_region->bufferRowLength = src.size_2d.input_width;
copy_region->bufferImageHeight = src.size_2d.input_height; copy_region->bufferImageHeight = src.size_2d.input_height;
copy_region->imageExtent = {src.size_2d.logical_width, copy_region->imageExtent = {src.size_2d.logical_width,
@ -956,87 +920,70 @@ bool TextureCache::ConvertTexture2D(uint8_t* dest,
return true; return true;
} }
} else { } else {
if (!src.is_tiled) { // Untile image.
uint32_t offset_x, offset_y; // We could do this in a shader to speed things up, as this is pretty
if (src.has_packed_mips && // slow.
TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) {
uint32_t bytes_per_block = src.format_info()->block_width *
src.format_info()->block_height *
src.format_info()->bits_per_pixel / 8;
const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address); // TODO(benvanik): optimize this inner loop (or work by tiles).
src_mem += offset_y * src.size_2d.input_pitch; const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
src_mem += offset_x * bytes_per_block; uint32_t bytes_per_block = src.format_info()->block_width *
for (uint32_t y = 0; src.format_info()->block_height *
y < std::min(src.size_2d.block_height, src.size_2d.logical_height); src.format_info()->bits_per_pixel / 8;
y++) {
TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch);
src_mem += src.size_2d.input_pitch;
dest += src.size_2d.input_pitch;
}
copy_region->bufferRowLength = src.size_2d.input_width;
copy_region->bufferImageHeight = src.size_2d.input_height;
copy_region->imageExtent = {src.size_2d.logical_width,
src.size_2d.logical_height, 1};
return true;
} else {
// Fast path copy entire image.
TextureSwap(src.endianness, dest, host_address, src.input_length);
copy_region->bufferRowLength = src.size_2d.input_width;
copy_region->bufferImageHeight = src.size_2d.input_height;
copy_region->imageExtent = {src.size_2d.logical_width,
src.size_2d.logical_height, 1};
return true;
}
} else {
// Untile image.
// We could do this in a shader to speed things up, as this is pretty
// slow.
// TODO(benvanik): optimize this inner loop (or work by tiles). uint32_t output_pitch = src.size_2d.input_width *
const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address); src.format_info()->block_width *
uint32_t bytes_per_block = src.format_info()->block_width * src.format_info()->bits_per_pixel / 8;
src.format_info()->block_height *
src.format_info()->bits_per_pixel / 8;
// Tiled textures can be packed; get the offset into the packed texture. uint32_t output_row_height = 1;
uint32_t offset_x; if (src.texture_format == TextureFormat::k_CTX1) {
uint32_t offset_y; // TODO: Can we calculate this?
TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); output_row_height = 4;
auto log2_bpp = (bytes_per_block >> 2) + }
((bytes_per_block >> 1) >> (bytes_per_block >> 2));
// Offset to the current row, in bytes. // Tiled textures can be packed; get the offset into the packed texture.
uint32_t output_row_offset = 0; uint32_t offset_x;
for (uint32_t y = 0; y < src.size_2d.block_height; y++) { uint32_t offset_y;
auto input_row_offset = TextureInfo::TiledOffset2DOuter( TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
offset_y + y, src.size_2d.block_width, log2_bpp); auto log2_bpp = (bytes_per_block >> 2) +
((bytes_per_block >> 1) >> (bytes_per_block >> 2));
// Go block-by-block on this row. // Offset to the current row, in bytes.
uint32_t output_offset = output_row_offset; uint32_t output_row_offset = 0;
for (uint32_t x = 0; x < src.size_2d.block_width; x++) { for (uint32_t y = 0; y < src.size_2d.block_height; y++) {
auto input_offset = auto input_row_offset = TextureInfo::TiledOffset2DOuter(
TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, offset_y + y, src.size_2d.block_width, log2_bpp);
log2_bpp, input_row_offset) >>
log2_bpp;
// Go block-by-block on this row.
uint32_t output_offset = output_row_offset;
for (uint32_t x = 0; x < src.size_2d.block_width; x++) {
auto input_offset = TextureInfo::TiledOffset2DInner(
offset_x + x, offset_y + y, log2_bpp, input_row_offset);
input_offset >>= log2_bpp;
if (src.texture_format == TextureFormat::k_CTX1) {
// Convert to R8G8.
// NOTE(review): the call below passes src_mem unmodified, while the generic
// branch reads from src_mem + input_offset * bytes_per_block. As written,
// input_offset is unused for CTX1, so every block appears to be decoded from
// the start of the texture data - confirm and add the block offset.
ConvertTexelCTX1(&dest[output_offset], output_pitch, src_mem,
src.endianness);
} else {
// Generic swap to destination.
TextureSwap(src.endianness, dest + output_offset, TextureSwap(src.endianness, dest + output_offset,
src_mem + input_offset * bytes_per_block, src_mem + input_offset * bytes_per_block,
bytes_per_block); bytes_per_block);
output_offset += bytes_per_block;
} }
output_row_offset += src.size_2d.input_pitch; output_offset += bytes_per_block;
} }
copy_region->bufferRowLength = src.size_2d.input_width; output_row_offset += output_pitch * output_row_height;
copy_region->bufferImageHeight = src.size_2d.input_height;
copy_region->imageExtent = {src.size_2d.logical_width,
src.size_2d.logical_height, 1};
return true;
} }
copy_region->bufferRowLength = src.size_2d.input_width;
copy_region->bufferImageHeight = src.size_2d.input_height;
copy_region->imageExtent = {src.size_2d.logical_width,
src.size_2d.logical_height, 1};
return true;
} }
return false; return false;
} }
@ -1044,60 +991,56 @@ bool TextureCache::ConvertTextureCube(uint8_t* dest,
VkBufferImageCopy* copy_region, VkBufferImageCopy* copy_region,
const TextureInfo& src) { const TextureInfo& src) {
void* host_address = memory_->TranslatePhysical(src.guest_address); void* host_address = memory_->TranslatePhysical(src.guest_address);
if (src.texture_format == TextureFormat::k_CTX1) { if (!src.is_tiled) {
assert_always(); // Fast path copy entire image.
TextureSwap(src.endianness, dest, host_address, src.input_length);
copy_region->bufferRowLength = src.size_cube.input_width;
copy_region->bufferImageHeight = src.size_cube.input_height;
copy_region->imageExtent = {src.size_cube.logical_width,
src.size_cube.logical_height, 6};
return true;
} else { } else {
if (!src.is_tiled) { // TODO(benvanik): optimize this inner loop (or work by tiles).
// Fast path copy entire image. const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address);
TextureSwap(src.endianness, dest, host_address, src.input_length); uint32_t bytes_per_block = src.format_info()->block_width *
copy_region->bufferRowLength = src.size_cube.input_width; src.format_info()->block_height *
copy_region->bufferImageHeight = src.size_cube.input_height; src.format_info()->bits_per_pixel / 8;
copy_region->imageExtent = {src.size_cube.logical_width, // Tiled textures can be packed; get the offset into the packed texture.
src.size_cube.logical_height, 6}; uint32_t offset_x;
return true; uint32_t offset_y;
} else { TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y);
// TODO(benvanik): optimize this inner loop (or work by tiles). auto bpp = (bytes_per_block >> 2) +
const uint8_t* src_mem = reinterpret_cast<const uint8_t*>(host_address); ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
uint32_t bytes_per_block = src.format_info()->block_width * for (int face = 0; face < 6; ++face) {
src.format_info()->block_height * for (uint32_t y = 0, output_base_offset = 0;
src.format_info()->bits_per_pixel / 8; y < src.size_cube.block_height;
// Tiled textures can be packed; get the offset into the packed texture. y++, output_base_offset += src.size_cube.input_pitch) {
uint32_t offset_x; auto input_base_offset = TextureInfo::TiledOffset2DOuter(
uint32_t offset_y; offset_y + y,
TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); (src.size_cube.input_width / src.format_info()->block_width), bpp);
auto bpp = (bytes_per_block >> 2) + for (uint32_t x = 0, output_offset = output_base_offset;
((bytes_per_block >> 1) >> (bytes_per_block >> 2)); x < src.size_cube.block_width;
for (int face = 0; face < 6; ++face) { x++, output_offset += bytes_per_block) {
for (uint32_t y = 0, output_base_offset = 0; auto input_offset =
y < src.size_cube.block_height; TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
y++, output_base_offset += src.size_cube.input_pitch) { input_base_offset) >>
auto input_base_offset = TextureInfo::TiledOffset2DOuter( bpp;
offset_y + y, TextureSwap(src.endianness, dest + output_offset,
(src.size_cube.input_width / src.format_info()->block_width), src_mem + input_offset * bytes_per_block,
bpp); bytes_per_block);
for (uint32_t x = 0, output_offset = output_base_offset;
x < src.size_cube.block_width;
x++, output_offset += bytes_per_block) {
auto input_offset =
TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
input_base_offset) >>
bpp;
TextureSwap(src.endianness, dest + output_offset,
src_mem + input_offset * bytes_per_block,
bytes_per_block);
}
} }
src_mem += src.size_cube.input_face_length;
dest += src.size_cube.input_face_length;
} }
src_mem += src.size_cube.input_face_length;
copy_region->bufferRowLength = src.size_cube.input_width; dest += src.size_cube.input_face_length;
copy_region->bufferImageHeight = src.size_cube.input_height;
copy_region->imageExtent = {src.size_cube.logical_width,
src.size_cube.logical_height, 6};
return true;
} }
copy_region->bufferRowLength = src.size_cube.input_width;
copy_region->bufferImageHeight = src.size_cube.input_height;
copy_region->imageExtent = {src.size_cube.logical_width,
src.size_cube.logical_height, 6};
return true;
} }
return false; return false;
} }

View File

@ -140,6 +140,9 @@ class TextureCache {
void FlushPendingCommands(VkCommandBuffer command_buffer, void FlushPendingCommands(VkCommandBuffer command_buffer,
VkFence completion_fence); VkFence completion_fence);
static void ConvertTexelCTX1(uint8_t* dest, size_t dest_pitch,
const uint8_t* src, Endian src_endianness);
bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region, bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region,
const TextureInfo& src); const TextureInfo& src);
bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region, bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region,