diff --git a/src/xenia/gpu/gl4/texture_cache.cc b/src/xenia/gpu/gl4/texture_cache.cc index e1100a997..51de9118f 100644 --- a/src/xenia/gpu/gl4/texture_cache.cc +++ b/src/xenia/gpu/gl4/texture_cache.cc @@ -689,6 +689,82 @@ void TextureCache::EvictTexture(TextureEntry* entry) { delete entry; } +struct HostTextureInfo { + uint32_t output_length; + + union { + struct { + uint32_t output_width; + uint32_t output_pitch; + } size_1d; + struct { + uint32_t output_width; + uint32_t output_height; + uint32_t output_pitch; + } size_2d; + struct { + } size_3d; + struct { + uint32_t output_width; + uint32_t output_height; + uint32_t output_pitch; + uint32_t output_face_length; + } size_cube; + }; + + static bool Setup(const TextureInfo& guest_info, HostTextureInfo* out_info) { + auto& info = *out_info; + auto format = guest_info.format_info(); + + uint32_t bytes_per_block = format->block_width * format->bits_per_pixel / 8; + + switch (guest_info.dimension) { + case Dimension::k1D: { + uint32_t block_width = xe::round_up(guest_info.size_1d.logical_width, + format->block_width) / + format->block_width; + info.size_1d.output_width = block_width * format->block_width; + info.size_1d.output_pitch = block_width * bytes_per_block; + info.output_length = info.size_1d.output_pitch; + return true; + } + case Dimension::k2D: { + uint32_t block_width = xe::round_up(guest_info.size_2d.logical_width, + format->block_width) / + format->block_width; + uint32_t block_height = xe::round_up(guest_info.size_2d.logical_height, + format->block_height) / + format->block_height; + info.size_2d.output_width = block_width * format->block_width; + info.size_2d.output_height = block_height * format->block_height; + info.size_2d.output_pitch = block_width * bytes_per_block; + info.output_length = info.size_2d.output_pitch * block_height; + return true; + }; + case Dimension::k3D: { + return false; + } + case Dimension::kCube: { + uint32_t block_width = xe::round_up(guest_info.size_cube.logical_width, + format->block_width) / + format->block_width; + uint32_t block_height = + xe::round_up(guest_info.size_cube.logical_height, + format->block_height) / + format->block_height; + info.size_cube.output_width = block_width * format->block_width; + info.size_cube.output_height = block_height * format->block_height; + info.size_cube.output_pitch = block_width * bytes_per_block; + info.size_cube.output_face_length = + info.size_cube.output_pitch * block_height; + info.output_length = info.size_cube.output_face_length * 6; + return true; + } + } + return false; + } +}; + void TextureSwap(Endian endianness, void* dest, const void* src, size_t length) { switch (endianness) { @@ -720,14 +796,20 @@ bool TextureCache::UploadTexture1D(GLuint texture, return false; } - size_t unpack_length = texture_info.output_length; + HostTextureInfo host_info; + if (!HostTextureInfo::Setup(texture_info, &host_info)) { + assert_always("Failed to set up host texture info"); + return false; + } + + size_t unpack_length = host_info.output_length; glTextureStorage1D(texture, 1, config.internal_format, - texture_info.size_1d.output_width); + host_info.size_1d.output_width); auto allocation = scratch_buffer_->Acquire(unpack_length); if (!texture_info.is_tiled) { - if (texture_info.size_1d.input_pitch == texture_info.size_1d.output_pitch) { + if (texture_info.size_1d.input_pitch == host_info.size_1d.output_pitch) { TextureSwap(texture_info.endianness, allocation.host_ptr, host_address, unpack_length); } else { @@ -744,10 +826,10 @@ bool 
TextureCache::UploadTexture1D(GLuint texture, glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle()); if (texture_info.is_compressed()) { - glCompressedTextureSubImage1D( - texture, 0, 0, texture_info.size_1d.output_width, config.format, - static_cast(unpack_length), - reinterpret_cast(unpack_offset)); + glCompressedTextureSubImage1D(texture, 0, 0, host_info.size_1d.output_width, + config.format, + static_cast(unpack_length), + reinterpret_cast(unpack_offset)); } else { // Most of these don't seem to have an effect on compressed images. // glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_TRUE); @@ -755,7 +837,7 @@ bool TextureCache::UploadTexture1D(GLuint texture, // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTextureSubImage1D(texture, 0, 0, texture_info.size_1d.output_width, + glTextureSubImage1D(texture, 0, 0, host_info.size_1d.output_width, config.format, config.type, reinterpret_cast(unpack_offset)); } @@ -776,10 +858,16 @@ bool TextureCache::UploadTexture2D(GLuint texture, return false; } - size_t unpack_length = texture_info.output_length; + HostTextureInfo host_info; + if (!HostTextureInfo::Setup(texture_info, &host_info)) { + assert_always("Failed to set up host texture info"); + return false; + } + + size_t unpack_length = host_info.output_length; glTextureStorage2D(texture, 1, config.internal_format, - texture_info.size_2d.output_width, - texture_info.size_2d.output_height); + host_info.size_2d.output_width, + host_info.size_2d.output_height); auto allocation = scratch_buffer_->Acquire(unpack_length); @@ -796,16 +884,16 @@ bool TextureCache::UploadTexture2D(GLuint texture, src += offset_x * bytes_per_block; uint8_t* dest = reinterpret_cast(allocation.host_ptr); uint32_t pitch = std::min(texture_info.size_2d.input_pitch, - texture_info.size_2d.output_pitch); + host_info.size_2d.output_pitch); for (uint32_t y = 0; y < std::min(texture_info.size_2d.block_height, texture_info.size_2d.logical_height); y++) { TextureSwap(texture_info.endianness, dest, src, pitch); src += texture_info.size_2d.input_pitch; - dest += texture_info.size_2d.output_pitch; + dest += host_info.size_2d.output_pitch; } } else if (texture_info.size_2d.input_pitch == - texture_info.size_2d.output_pitch) { + host_info.size_2d.output_pitch) { // Fast path copy entire image. 
TextureSwap(texture_info.endianness, allocation.host_ptr, host_address, unpack_length); @@ -816,13 +904,13 @@ bool TextureCache::UploadTexture2D(GLuint texture, const uint8_t* src = host_address; uint8_t* dest = reinterpret_cast(allocation.host_ptr); uint32_t pitch = std::min(texture_info.size_2d.input_pitch, - texture_info.size_2d.output_pitch); + host_info.size_2d.output_pitch); for (uint32_t y = 0; y < std::min(texture_info.size_2d.block_height, texture_info.size_2d.logical_height); y++) { TextureSwap(texture_info.endianness, dest, src, pitch); src += texture_info.size_2d.input_pitch; - dest += texture_info.size_2d.output_pitch; + dest += host_info.size_2d.output_pitch; } } } else { @@ -846,7 +934,7 @@ bool TextureCache::UploadTexture2D(GLuint texture, for (uint32_t y = 0, output_base_offset = 0; y < std::min(texture_info.size_2d.block_height, texture_info.size_2d.logical_height); - y++, output_base_offset += texture_info.size_2d.output_pitch) { + y++, output_base_offset += host_info.size_2d.output_pitch) { auto input_base_offset = TextureInfo::TiledOffset2DOuter( offset_y + y, (texture_info.size_2d.input_width / texture_info.format_info()->block_width), @@ -872,8 +960,8 @@ bool TextureCache::UploadTexture2D(GLuint texture, glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle()); if (texture_info.is_compressed()) { glCompressedTextureSubImage2D( - texture, 0, 0, 0, texture_info.size_2d.output_width, - texture_info.size_2d.output_height, config.format, + texture, 0, 0, 0, host_info.size_2d.output_width, + host_info.size_2d.output_height, config.format, static_cast(unpack_length), reinterpret_cast(unpack_offset)); } else { @@ -883,8 +971,8 @@ bool TextureCache::UploadTexture2D(GLuint texture, // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTextureSubImage2D(texture, 0, 0, 0, texture_info.size_2d.output_width, - texture_info.size_2d.output_height, config.format, + glTextureSubImage2D(texture, 0, 0, 0, host_info.size_2d.output_width, + host_info.size_2d.output_height, config.format, config.type, reinterpret_cast(unpack_offset)); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); @@ -904,15 +992,21 @@ bool TextureCache::UploadTextureCube(GLuint texture, return false; } - size_t unpack_length = texture_info.output_length; + HostTextureInfo host_info; + if (!HostTextureInfo::Setup(texture_info, &host_info)) { + assert_always("Failed to set up host texture info"); + return false; + } + + size_t unpack_length = host_info.output_length; glTextureStorage2D(texture, 1, config.internal_format, - texture_info.size_cube.output_width, - texture_info.size_cube.output_height); + host_info.size_cube.output_width, + host_info.size_cube.output_height); auto allocation = scratch_buffer_->Acquire(unpack_length); if (!texture_info.is_tiled) { if (texture_info.size_cube.input_pitch == - texture_info.size_cube.output_pitch) { + host_info.size_cube.output_pitch) { // Fast path copy entire image. 
TextureSwap(texture_info.endianness, allocation.host_ptr, host_address, unpack_length); @@ -924,11 +1018,11 @@ bool TextureCache::UploadTextureCube(GLuint texture, uint8_t* dest = reinterpret_cast(allocation.host_ptr); for (int face = 0; face < 6; ++face) { uint32_t pitch = std::min(texture_info.size_cube.input_pitch, - texture_info.size_cube.output_pitch); + host_info.size_cube.output_pitch); for (uint32_t y = 0; y < texture_info.size_cube.block_height; y++) { TextureSwap(texture_info.endianness, dest, src, pitch); src += texture_info.size_cube.input_pitch; - dest += texture_info.size_cube.output_pitch; + dest += host_info.size_cube.output_pitch; } } } @@ -948,7 +1042,7 @@ bool TextureCache::UploadTextureCube(GLuint texture, for (int face = 0; face < 6; ++face) { for (uint32_t y = 0, output_base_offset = 0; y < texture_info.size_cube.block_height; - y++, output_base_offset += texture_info.size_cube.output_pitch) { + y++, output_base_offset += host_info.size_cube.output_pitch) { auto input_base_offset = TextureInfo::TiledOffset2DOuter( offset_y + y, (texture_info.size_cube.input_width / texture_info.format_info()->block_width), @@ -965,7 +1059,7 @@ bool TextureCache::UploadTextureCube(GLuint texture, } } src += texture_info.size_cube.input_face_length; - dest += texture_info.size_cube.output_face_length; + dest += host_info.size_cube.output_face_length; } } size_t unpack_offset = allocation.offset; @@ -977,8 +1071,8 @@ bool TextureCache::UploadTextureCube(GLuint texture, glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle()); if (texture_info.is_compressed()) { glCompressedTextureSubImage3D( - texture, 0, 0, 0, 0, texture_info.size_cube.output_width, - texture_info.size_cube.output_height, 6, config.format, + texture, 0, 0, 0, 0, host_info.size_cube.output_width, + host_info.size_cube.output_height, 6, config.format, static_cast(unpack_length), reinterpret_cast(unpack_offset)); } else { @@ -988,9 +1082,8 @@ bool TextureCache::UploadTextureCube(GLuint texture, // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTextureSubImage3D(texture, 0, 0, 0, 0, - texture_info.size_cube.output_width, - texture_info.size_cube.output_height, 6, config.format, + glTextureSubImage3D(texture, 0, 0, 0, 0, host_info.size_cube.output_width, + host_info.size_cube.output_height, 6, config.format, config.type, reinterpret_cast(unpack_offset)); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); diff --git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc index df83a1292..1ae30591e 100644 --- a/src/xenia/gpu/texture_info.cc +++ b/src/xenia/gpu/texture_info.cc @@ -56,7 +56,6 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch, info.is_tiled = fetch.tiled; info.has_packed_mips = fetch.packed_mips; info.input_length = 0; // Populated below. - info.output_length = 0; if (info.format_info()->format == TextureFormat::kUnknown) { assert_true("Unsupported texture format"); @@ -71,15 +70,6 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch, case Dimension::k2D: { info.CalculateTextureSizes2D(fetch.size_2d.width + 1, fetch.size_2d.height + 1); - - // DEBUG: Make sure our calculated pitch is equal to the fetch pitch. 
- uint32_t bytes_per_block = info.format_info()->block_width * - info.format_info()->block_height * - info.format_info()->bits_per_pixel / 8; - - assert_true(info.size_2d.input_pitch == - (bytes_per_block * fetch.pitch << 5) / - info.format_info()->block_width); } break; case Dimension::k3D: { // TODO(benvanik): calculate size. @@ -110,7 +100,6 @@ bool TextureInfo::PrepareResolve(uint32_t physical_address, info.is_tiled = true; info.has_packed_mips = false; info.input_length = 0; - info.output_length = 0; if (info.format_info()->format == TextureFormat::kUnknown) { assert_true("Unsupported texture format"); @@ -145,11 +134,6 @@ void TextureInfo::CalculateTextureSizes1D(uint32_t width) { size_1d.input_width = tile_width * 32 * format->block_width; size_1d.input_pitch = byte_pitch; input_length = size_1d.input_pitch; - - // TODO(DrChat): Remove this, leave it up to the backend. - size_1d.output_width = block_width * format->block_width; - size_1d.output_pitch = block_width * bytes_per_block; - output_length = size_1d.output_pitch; } void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) { @@ -190,13 +174,6 @@ void TextureInfo::CalculateTextureSizes2D(uint32_t width, uint32_t height) { size_2d.input_pitch = byte_pitch; input_length = size_2d.input_pitch * size_2d.block_height; - - // TODO(DrChat): Remove this, leave it up to the backend. - size_2d.output_width = block_width * format->block_width; - size_2d.output_height = block_height * format->block_height; - size_2d.output_pitch = block_width * bytes_per_block; - - output_length = size_2d.output_pitch * block_height; } void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height, @@ -235,14 +212,6 @@ void TextureInfo::CalculateTextureSizesCube(uint32_t width, uint32_t height, size_cube.input_face_length = size_cube.input_pitch * size_cube.block_height; input_length = size_cube.input_face_length * 6; - - // TODO(DrChat): Remove this, leave it up to the backend. - size_cube.output_width = block_width * format->block_width; - size_cube.output_height = block_height * format->block_height; - size_cube.output_pitch = block_width * bytes_per_block; - - size_cube.output_face_length = size_cube.output_pitch * block_height; - output_length = size_cube.output_face_length * 6; } bool TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info, diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h index 804496057..691680a81 100644 --- a/src/xenia/gpu/texture_info.h +++ b/src/xenia/gpu/texture_info.h @@ -256,7 +256,6 @@ struct TextureInfo { bool is_tiled; bool has_packed_mips; uint32_t input_length; - uint32_t output_length; const FormatInfo* format_info() const { return FormatInfo::Get(static_cast(texture_format)); @@ -272,10 +271,6 @@ struct TextureInfo { uint32_t block_width; // # of horizontal blocks uint32_t input_width; // pixel pitch uint32_t input_pitch; // pitch in bytes - - // DEPRECATED: Do not use. - uint32_t output_width; - uint32_t output_pitch; } size_1d; struct { uint32_t logical_width; @@ -285,11 +280,6 @@ struct TextureInfo { uint32_t input_width; // pixel pitch uint32_t input_height; // pixel height uint32_t input_pitch; // pitch in bytes - - // DEPRECATED: Do not use. - uint32_t output_width; - uint32_t output_height; - uint32_t output_pitch; } size_2d; struct { } size_3d; @@ -302,12 +292,6 @@ struct TextureInfo { uint32_t input_height; // pixel height uint32_t input_pitch; // pitch in bytes uint32_t input_face_length; // pitch of face in bytes - - // DEPRECATED: Do not use. 
- uint32_t output_width; - uint32_t output_height; - uint32_t output_pitch; - uint32_t output_face_length; } size_cube; }; diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index ebffeb10f..80783487c 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -98,7 +98,7 @@ static const TextureConfig texture_configs[64] = { {TextureFormat::k_32_32_32_FLOAT, VK_FORMAT_R32G32B32_SFLOAT}, {TextureFormat::k_DXT3A, VK_FORMAT_UNDEFINED}, {TextureFormat::k_DXT5A, VK_FORMAT_UNDEFINED}, - {TextureFormat::k_CTX1, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_CTX1, VK_FORMAT_R8G8_UINT}, {TextureFormat::k_DXT3A_AS_1_1_1_1, VK_FORMAT_UNDEFINED}, {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, @@ -545,29 +545,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, trace_writer_->WriteMemoryRead(texture_info.guest_address, texture_info.input_length); - bool uploaded = false; - switch (texture_info.dimension) { - case Dimension::k1D: { - uploaded = UploadTexture1D(command_buffer, completion_fence, texture, - texture_info); - } break; - - case Dimension::k2D: { - uploaded = UploadTexture2D(command_buffer, completion_fence, texture, - texture_info); - } break; - - case Dimension::kCube: { - uploaded = UploadTextureCube(command_buffer, completion_fence, texture, - texture_info); - } break; - - default: - assert_unhandled_case(texture_info.dimension); - break; - } - - if (!uploaded) { + if (!UploadTexture(command_buffer, completion_fence, texture, texture_info)) { FreeTexture(texture); return nullptr; } @@ -578,7 +556,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT, xe::format_string( "0x%.8X - 0x%.8X", texture_info.guest_address, - texture_info.guest_address + texture_info.output_length)); + texture_info.guest_address + texture_info.input_length)); // Okay. Now that the texture is uploaded from system memory, put a writewatch // on it to tell us if it's been modified from the guest. 
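For reference, a minimal standalone sketch of the block round-up that the new `HostTextureInfo::Setup` above performs for the 2D case (and that the removed `CalculateTextureSizes2D` output code used to do). The 100x100 size, the 4x4 block dimensions, and the 8-bytes-per-block figure are illustrative assumptions (roughly DXT1), and the local `round_up` stands in for `xe::round_up`, assumed to round an integer up to the next multiple:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for xe::round_up: assumed to round 'value' up to the next
// multiple of 'multiple'.
static uint32_t round_up(uint32_t value, uint32_t multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

int main() {
  // Hypothetical guest texture: 100x100, DXT1-style block compression
  // (4x4 texel blocks, 8 bytes per block -- assumed values for illustration).
  const uint32_t logical_width = 100, logical_height = 100;
  const uint32_t block_w = 4, block_h = 4, bytes_per_block = 8;

  // Round the logical size up to whole blocks, as Setup() does.
  uint32_t blocks_x = round_up(logical_width, block_w) / block_w;   // 25
  uint32_t blocks_y = round_up(logical_height, block_h) / block_h;  // 25

  uint32_t output_width = blocks_x * block_w;          // 100
  uint32_t output_height = blocks_y * block_h;         // 100
  uint32_t output_pitch = blocks_x * bytes_per_block;  // 200 bytes per block row
  uint32_t output_length = output_pitch * blocks_y;    // 5000 bytes for the level

  std::printf("%ux%u -> %u blocks/row, pitch %u, length %u\n", output_width,
              output_height, blocks_x, output_pitch, output_length);
  return 0;
}
```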
@@ -912,169 +890,230 @@ void TextureCache::FlushPendingCommands(VkCommandBuffer command_buffer, vkBeginCommandBuffer(command_buffer, &begin_info); } -void TextureCache::ConvertTexture1D(uint8_t* dest, const TextureInfo& src) { +bool TextureCache::ConvertTexture1D(uint8_t* dest, + VkBufferImageCopy* copy_region, + const TextureInfo& src) { void* host_address = memory_->TranslatePhysical(src.guest_address); - if (!src.is_tiled) { - if (src.size_1d.input_pitch == src.size_1d.output_pitch) { - TextureSwap(src.endianness, dest, host_address, src.output_length); + if (src.texture_format == TextureFormat::k_CTX1) { + assert_always(); + } else { + if (!src.is_tiled) { + TextureSwap(src.endianness, dest, host_address, src.input_length); + copy_region->bufferRowLength = src.size_1d.input_width; + copy_region->bufferImageHeight = 1; + copy_region->imageExtent = {src.size_1d.logical_width, 1, 1}; + return true; } else { assert_always(); } - } else { - assert_always(); } + return false; } -void TextureCache::ConvertTexture2D(uint8_t* dest, const TextureInfo& src) { +bool TextureCache::ConvertTexture2D(uint8_t* dest, + VkBufferImageCopy* copy_region, + const TextureInfo& src) { void* host_address = memory_->TranslatePhysical(src.guest_address); - if (!src.is_tiled) { - uint32_t offset_x, offset_y; - if (src.has_packed_mips && - TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) { + if (src.texture_format == TextureFormat::k_CTX1) { + assert_always(); + } else { + if (!src.is_tiled) { + uint32_t offset_x, offset_y; + if (src.has_packed_mips && + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y)) { + uint32_t bytes_per_block = src.format_info()->block_width * + src.format_info()->block_height * + src.format_info()->bits_per_pixel / 8; + + const uint8_t* src_mem = reinterpret_cast(host_address); + src_mem += offset_y * src.size_2d.input_pitch; + src_mem += offset_x * bytes_per_block; + for (uint32_t y = 0; + y < std::min(src.size_2d.block_height, src.size_2d.logical_height); + y++) { + TextureSwap(src.endianness, dest, src_mem, src.size_2d.input_pitch); + src_mem += src.size_2d.input_pitch; + dest += src.size_2d.input_pitch; + } + copy_region->bufferRowLength = src.size_2d.input_width; + copy_region->bufferImageHeight = src.size_2d.input_height; + copy_region->imageExtent = {src.size_2d.logical_width, + src.size_2d.logical_height, 1}; + return true; + } else { + // Fast path copy entire image. + TextureSwap(src.endianness, dest, host_address, src.input_length); + copy_region->bufferRowLength = src.size_2d.input_width; + copy_region->bufferImageHeight = src.size_2d.input_height; + copy_region->imageExtent = {src.size_2d.logical_width, + src.size_2d.logical_height, 1}; + return true; + } + } else { + // Untile image. + // We could do this in a shader to speed things up, as this is pretty + // slow. + + // TODO(benvanik): optimize this inner loop (or work by tiles). 
+ const uint8_t* src_mem = reinterpret_cast(host_address); uint32_t bytes_per_block = src.format_info()->block_width * src.format_info()->block_height * src.format_info()->bits_per_pixel / 8; - const uint8_t* src_mem = reinterpret_cast(host_address); - src_mem += offset_y * src.size_2d.input_pitch; - src_mem += offset_x * bytes_per_block; - uint32_t pitch = - std::min(src.size_2d.input_pitch, src.size_2d.output_pitch); - for (uint32_t y = 0; - y < std::min(src.size_2d.block_height, src.size_2d.logical_height); - y++) { - TextureSwap(src.endianness, dest, src_mem, pitch); - src_mem += src.size_2d.input_pitch; - dest += src.size_2d.output_pitch; - } - } else if (src.size_2d.input_pitch == src.size_2d.output_pitch) { - // Fast path copy entire image. - TextureSwap(src.endianness, dest, host_address, src.output_length); - } else { - // Slow path copy row-by-row because strides differ. - // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does - // this exact thing under the covers, so we just always do it here. - const uint8_t* src_mem = reinterpret_cast(host_address); - uint32_t pitch = - std::min(src.size_2d.input_pitch, src.size_2d.output_pitch); - for (uint32_t y = 0; - y < std::min(src.size_2d.block_height, src.size_2d.logical_height); - y++) { - TextureSwap(src.endianness, dest, src_mem, pitch); - src_mem += src.size_2d.input_pitch; - dest += src.size_2d.output_pitch; - } - } - } else { - // Untile image. - // We could do this in a shader to speed things up, as this is pretty slow. + // Tiled textures can be packed; get the offset into the packed texture. + uint32_t offset_x; + uint32_t offset_y; + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); + auto log2_bpp = (bytes_per_block >> 2) + + ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); - // TODO(benvanik): optimize this inner loop (or work by tiles). - const uint8_t* src_mem = reinterpret_cast(host_address); - uint32_t bytes_per_block = src.format_info()->block_width * - src.format_info()->block_height * - src.format_info()->bits_per_pixel / 8; + // Offset to the current row, in bytes. + uint32_t output_row_offset = 0; + for (uint32_t y = 0; y < src.size_2d.block_height; y++) { + auto input_row_offset = TextureInfo::TiledOffset2DOuter( + offset_y + y, src.size_2d.block_width, log2_bpp); - // Tiled textures can be packed; get the offset into the packed texture. - uint32_t offset_x; - uint32_t offset_y; - TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); - auto log2_bpp = (bytes_per_block >> 2) + - ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); - - // Offset to the current row, in bytes. - uint32_t output_row_offset = 0; - for (uint32_t y = 0; y < src.size_2d.block_height; y++) { - auto input_row_offset = TextureInfo::TiledOffset2DOuter( - offset_y + y, src.size_2d.block_width, log2_bpp); - - // Go block-by-block on this row. 
- uint32_t output_offset = output_row_offset; - for (uint32_t x = 0; x < src.size_2d.block_width; x++) { - auto input_offset = - TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, - log2_bpp, input_row_offset) >> - log2_bpp; - - TextureSwap(src.endianness, dest + output_offset, - src_mem + input_offset * bytes_per_block, bytes_per_block); - - output_offset += bytes_per_block; - } - - output_row_offset += src.size_2d.output_pitch; - } - } -} - -void TextureCache::ConvertTextureCube(uint8_t* dest, const TextureInfo& src) { - void* host_address = memory_->TranslatePhysical(src.guest_address); - if (!src.is_tiled) { - if (src.size_cube.input_pitch == src.size_cube.output_pitch) { - // Fast path copy entire image. - TextureSwap(src.endianness, dest, host_address, src.output_length); - } else { - // Slow path copy row-by-row because strides differ. - // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does - // this exact thing under the covers, so we just always do it here. - const uint8_t* src_mem = reinterpret_cast(host_address); - for (int face = 0; face < 6; ++face) { - uint32_t pitch = - std::min(src.size_cube.input_pitch, src.size_cube.output_pitch); - for (uint32_t y = 0; y < src.size_cube.block_height; y++) { - TextureSwap(src.endianness, dest, src_mem, pitch); - src_mem += src.size_cube.input_pitch; - dest += src.size_cube.output_pitch; - } - } - } - } else { - // TODO(benvanik): optimize this inner loop (or work by tiles). - const uint8_t* src_mem = reinterpret_cast(host_address); - uint32_t bytes_per_block = src.format_info()->block_width * - src.format_info()->block_height * - src.format_info()->bits_per_pixel / 8; - // Tiled textures can be packed; get the offset into the packed texture. - uint32_t offset_x; - uint32_t offset_y; - TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); - auto bpp = (bytes_per_block >> 2) + - ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); - for (int face = 0; face < 6; ++face) { - for (uint32_t y = 0, output_base_offset = 0; - y < src.size_cube.block_height; - y++, output_base_offset += src.size_cube.output_pitch) { - auto input_base_offset = TextureInfo::TiledOffset2DOuter( - offset_y + y, - (src.size_cube.input_width / src.format_info()->block_width), bpp); - for (uint32_t x = 0, output_offset = output_base_offset; - x < src.size_cube.block_width; - x++, output_offset += bytes_per_block) { + // Go block-by-block on this row. 
+ uint32_t output_offset = output_row_offset; + for (uint32_t x = 0; x < src.size_2d.block_width; x++) { auto input_offset = - TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp, - input_base_offset) >> - bpp; + TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, + log2_bpp, input_row_offset) >> + log2_bpp; + TextureSwap(src.endianness, dest + output_offset, src_mem + input_offset * bytes_per_block, bytes_per_block); + + output_offset += bytes_per_block; } + + output_row_offset += src.size_2d.input_pitch; } - src_mem += src.size_cube.input_face_length; - dest += src.size_cube.output_face_length; + + copy_region->bufferRowLength = src.size_2d.input_width; + copy_region->bufferImageHeight = src.size_2d.input_height; + copy_region->imageExtent = {src.size_2d.logical_width, + src.size_2d.logical_height, 1}; + return true; } } + return false; +} + +bool TextureCache::ConvertTextureCube(uint8_t* dest, + VkBufferImageCopy* copy_region, + const TextureInfo& src) { + void* host_address = memory_->TranslatePhysical(src.guest_address); + if (src.texture_format == TextureFormat::k_CTX1) { + assert_always(); + } else { + if (!src.is_tiled) { + // Fast path copy entire image. + TextureSwap(src.endianness, dest, host_address, src.input_length); + copy_region->bufferRowLength = src.size_cube.input_width; + copy_region->bufferImageHeight = src.size_cube.input_height; + copy_region->imageExtent = {src.size_cube.logical_width, + src.size_cube.logical_height, 6}; + return true; + } else { + // TODO(benvanik): optimize this inner loop (or work by tiles). + const uint8_t* src_mem = reinterpret_cast(host_address); + uint32_t bytes_per_block = src.format_info()->block_width * + src.format_info()->block_height * + src.format_info()->bits_per_pixel / 8; + // Tiled textures can be packed; get the offset into the packed texture. 
+ uint32_t offset_x; + uint32_t offset_y; + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); + auto bpp = (bytes_per_block >> 2) + + ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); + for (int face = 0; face < 6; ++face) { + for (uint32_t y = 0, output_base_offset = 0; + y < src.size_cube.block_height; + y++, output_base_offset += src.size_cube.input_pitch) { + auto input_base_offset = TextureInfo::TiledOffset2DOuter( + offset_y + y, + (src.size_cube.input_width / src.format_info()->block_width), + bpp); + for (uint32_t x = 0, output_offset = output_base_offset; + x < src.size_cube.block_width; + x++, output_offset += bytes_per_block) { + auto input_offset = + TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp, + input_base_offset) >> + bpp; + TextureSwap(src.endianness, dest + output_offset, + src_mem + input_offset * bytes_per_block, + bytes_per_block); + } + } + src_mem += src.size_cube.input_face_length; + dest += src.size_cube.input_face_length; + } + + copy_region->bufferRowLength = src.size_cube.input_width; + copy_region->bufferImageHeight = src.size_cube.input_height; + copy_region->imageExtent = {src.size_cube.logical_width, + src.size_cube.logical_height, 6}; + return true; + } + } + return false; +} + +bool TextureCache::ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region, + const TextureInfo& src) { + switch (src.dimension) { + case Dimension::k1D: + return ConvertTexture1D(dest, copy_region, src); + case Dimension::k2D: + return ConvertTexture2D(dest, copy_region, src); + case Dimension::kCube: + return ConvertTextureCube(dest, copy_region, src); + } + return false; +} + +bool TextureCache::ComputeTextureStorage(size_t* output_length, + const TextureInfo& src) { + if (src.texture_format == TextureFormat::k_CTX1) { + switch (src.dimension) { + case Dimension::k1D: { + *output_length = src.size_1d.logical_width * 2; + return true; + } + case Dimension::k2D: { + *output_length = + src.size_2d.logical_width * src.size_2d.logical_height * 2; + return true; + } + case Dimension::kCube: { + *output_length = + src.size_cube.logical_width * src.size_cube.logical_height * 2 * 6; + return true; + } + } + return false; + } else { + *output_length = src.input_length; + return true; + } } -bool TextureCache::UploadTexture1D(VkCommandBuffer command_buffer, - VkFence completion_fence, Texture* dest, - const TextureInfo& src) { +bool TextureCache::UploadTexture(VkCommandBuffer command_buffer, + VkFence completion_fence, Texture* dest, + const TextureInfo& src) { #if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES - assert_true(src.dimension == Dimension::k1D); + size_t unpack_length; + if (!ComputeTextureStorage(&unpack_length, src)) { + XELOGW("Failed to compute texture storage"); + return false; + } - size_t unpack_length = src.output_length; if (!staging_buffer_.CanAcquire(unpack_length)) { // Need to have unique memory for every upload for at least one frame. If we // run out of memory, we need to flush all queued upload commands to the @@ -1100,14 +1139,20 @@ bool TextureCache::UploadTexture1D(VkCommandBuffer command_buffer, // TODO: If the GPU supports it, we can submit a compute batch to convert the // texture and copy it to its destination. Otherwise, fallback to conversion // on the CPU. 
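The new `ComputeTextureStorage` sizes the staging allocation from the host-side format rather than the guest input: CTX1 now maps to VK_FORMAT_R8G8_UINT (two bytes per texel on the host side), so its staging size is 2 bytes per logical texel, times six faces for cubes, while every other format keeps using `input_length`. A small standalone sketch of that rule, with a simplified hypothetical signature:

```cpp
#include <cstdint>
#include <cstdio>

enum class Dimension { k1D, k2D, kCube };

// Mirrors the staging-size rule of the new ComputeTextureStorage(): CTX1 is
// sized for a 2-bytes-per-texel host format (R8G8), everything else keeps the
// guest input_length. Hypothetical, simplified parameters for illustration.
uint64_t ComputeStagingSize(bool is_ctx1, Dimension dim, uint32_t width,
                            uint32_t height, uint32_t input_length) {
  if (!is_ctx1) {
    return input_length;
  }
  switch (dim) {
    case Dimension::k1D:
      return uint64_t(width) * 2;
    case Dimension::k2D:
      return uint64_t(width) * height * 2;
    case Dimension::kCube:
      return uint64_t(width) * height * 2 * 6;
  }
  return 0;
}

int main() {
  // A hypothetical 128x128 CTX1 2D texture: 128 * 128 * 2 = 32768 bytes of
  // staging, independent of the tiled guest input length.
  std::printf("%llu\n",
              (unsigned long long)ComputeStagingSize(
                  true, Dimension::k2D, 128, 128, /*input_length=*/0));
  return 0;
}
```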
- ConvertTexture1D(reinterpret_cast(alloc->host_ptr), src); - staging_buffer_.Flush(alloc); + VkBufferImageCopy copy_region; + if (!ConvertTexture(reinterpret_cast(alloc->host_ptr), ©_region, + src)) { + XELOGW("Failed to convert texture"); + return false; + } // Transition the texture into a transfer destination layout. VkImageMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.pNext = nullptr; barrier.srcAccessMask = 0; + // TODO(gibbed): is this correct? 1D+cube had VK_ACCESS_HOST_WRITE_BIT, but + // not 2D. barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT; barrier.oldLayout = dest->image_layout; @@ -1116,85 +1161,6 @@ bool TextureCache::UploadTexture1D(VkCommandBuffer command_buffer, barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.image = dest->image; barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, - nullptr, 1, &barrier); - - // Now move the converted texture into the destination. - VkBufferImageCopy copy_region; - copy_region.bufferOffset = alloc->offset; - copy_region.bufferRowLength = src.size_1d.output_width; - copy_region.bufferImageHeight = 1; - copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; - copy_region.imageOffset = {0, 0, 0}; - copy_region.imageExtent = {src.size_1d.output_width, 1, 1}; - vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(), - dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, - ©_region); - - // Now transition the texture into a shader readonly source. - barrier.srcAccessMask = barrier.dstAccessMask; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.oldLayout = barrier.newLayout; - barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, - nullptr, 1, &barrier); - - dest->image_layout = barrier.newLayout; - return true; -} - -bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer, - VkFence completion_fence, Texture* dest, - const TextureInfo& src) { -#if FINE_GRAINED_DRAW_SCOPES - SCOPE_profile_cpu_f("gpu"); -#endif // FINE_GRAINED_DRAW_SCOPES - - assert_true(src.dimension == Dimension::k2D); - - size_t unpack_length = src.output_length; - if (!staging_buffer_.CanAcquire(unpack_length)) { - // Need to have unique memory for every upload for at least one frame. If we - // run out of memory, we need to flush all queued upload commands to the - // GPU. - FlushPendingCommands(command_buffer, completion_fence); - - // Uploads have been flushed. Continue. - if (!staging_buffer_.CanAcquire(unpack_length)) { - // The staging buffer isn't big enough to hold this texture. - XELOGE( - "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)", - unpack_length); - assert_always(); - return false; - } - } - - // Grab some temporary memory for staging. - auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence); - assert_not_null(alloc); - - // Upload texture into GPU memory. - // TODO: If the GPU supports it, we can submit a compute batch to convert the - // texture and copy it to its destination. Otherwise, fallback to conversion - // on the CPU. - ConvertTexture2D(reinterpret_cast(alloc->host_ptr), src); - staging_buffer_.Flush(alloc); - - // Transition the texture into a transfer destination layout. 
- VkImageMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.oldLayout = dest->image_layout; - barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = dest->image; - barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; if (dest->format == VK_FORMAT_D16_UNORM_S8_UINT || dest->format == VK_FORMAT_D24_UNORM_S8_UINT || dest->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { @@ -1207,91 +1173,9 @@ bool TextureCache::UploadTexture2D(VkCommandBuffer command_buffer, nullptr, 1, &barrier); // Now move the converted texture into the destination. - VkBufferImageCopy copy_region; copy_region.bufferOffset = alloc->offset; - copy_region.bufferRowLength = src.size_2d.output_width; - copy_region.bufferImageHeight = src.size_2d.output_height; copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; copy_region.imageOffset = {0, 0, 0}; - copy_region.imageExtent = {src.size_2d.output_width, - src.size_2d.output_height, 1}; - vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(), - dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, - ©_region); - - // Now transition the texture into a shader readonly source. - barrier.srcAccessMask = barrier.dstAccessMask; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.oldLayout = barrier.newLayout; - barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, - nullptr, 1, &barrier); - - dest->image_layout = barrier.newLayout; - return true; -} - -bool TextureCache::UploadTextureCube(VkCommandBuffer command_buffer, - VkFence completion_fence, Texture* dest, - const TextureInfo& src) { - assert_true(src.dimension == Dimension::kCube); - - size_t unpack_length = src.output_length; - if (!staging_buffer_.CanAcquire(unpack_length)) { - // Need to have unique memory for every upload for at least one frame. If we - // run out of memory, we need to flush all queued upload commands to the - // GPU. - FlushPendingCommands(command_buffer, completion_fence); - - // Uploads have been flushed. Continue. - if (!staging_buffer_.CanAcquire(unpack_length)) { - // The staging buffer isn't big enough to hold this texture. - XELOGE( - "TextureCache staging buffer is too small! (uploading 0x%.8X bytes)", - unpack_length); - assert_always(); - return false; - } - } - - // Grab some temporary memory for staging. - auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence); - assert_not_null(alloc); - - // Upload texture into GPU memory. - // TODO: If the GPU supports it, we can submit a compute batch to convert the - // texture and copy it to its destination. Otherwise, fallback to conversion - // on the CPU. - ConvertTextureCube(reinterpret_cast(alloc->host_ptr), src); - staging_buffer_.Flush(alloc); - - // Transition the texture into a transfer destination layout. 
- VkImageMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = 0; - barrier.dstAccessMask = - VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT; - barrier.oldLayout = dest->image_layout; - barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = dest->image; - barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, - nullptr, 1, &barrier); - - // Now move the converted texture into the destination. - VkBufferImageCopy copy_region; - copy_region.bufferOffset = alloc->offset; - copy_region.bufferRowLength = src.size_cube.output_width; - copy_region.bufferImageHeight = src.size_cube.output_height; - copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; - copy_region.imageOffset = {0, 0, 0}; - copy_region.imageExtent = {src.size_cube.output_width, - src.size_cube.output_height, 6}; vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(), dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ©_region); diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index c806910b1..3a3b43f41 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -142,22 +142,21 @@ class TextureCache { void FlushPendingCommands(VkCommandBuffer command_buffer, VkFence completion_fence); - void ConvertTexture1D(uint8_t* dest, const TextureInfo& src); - void ConvertTexture2D(uint8_t* dest, const TextureInfo& src); - void ConvertTextureCube(uint8_t* dest, const TextureInfo& src); + bool ConvertTexture1D(uint8_t* dest, VkBufferImageCopy* copy_region, + const TextureInfo& src); + bool ConvertTexture2D(uint8_t* dest, VkBufferImageCopy* copy_region, + const TextureInfo& src); + bool ConvertTextureCube(uint8_t* dest, VkBufferImageCopy* copy_region, + const TextureInfo& src); + bool ConvertTexture(uint8_t* dest, VkBufferImageCopy* copy_region, + const TextureInfo& src); + bool ComputeTextureStorage(size_t* output_length, const TextureInfo& src); // Queues commands to upload a texture from system memory, applying any // conversions necessary. This may flush the command buffer to the GPU if we // run out of staging memory. - bool UploadTexture1D(VkCommandBuffer command_buffer, VkFence completion_fence, - Texture* dest, const TextureInfo& src); - - bool UploadTexture2D(VkCommandBuffer command_buffer, VkFence completion_fence, - Texture* dest, const TextureInfo& src); - - bool UploadTextureCube(VkCommandBuffer command_buffer, - VkFence completion_fence, Texture* dest, - const TextureInfo& src); + bool UploadTexture(VkCommandBuffer command_buffer, VkFence completion_fence, + Texture* dest, const TextureInfo& src); void HashTextureBindings(XXH64_state_t* hash_state, uint32_t& fetch_mask, const std::vector& bindings);
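One detail worth calling out from the untiling paths kept in `ConvertTexture2D` and `ConvertTextureCube`: they derive log2 of the block byte size with the branchless expression `(bpb >> 2) + ((bpb >> 1) >> (bpb >> 2))` before calling `TiledOffset2DOuter`/`TiledOffset2DInner`. A quick standalone check that this matches log2 for the power-of-two block sizes the supported formats appear to use (assumed range 1 to 16 bytes; the identity does not hold beyond that):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // The expression below is the same one ConvertTexture2D/ConvertTextureCube
  // use to get log2(bytes_per_block) without a loop or intrinsic.
  for (uint32_t bpb = 1; bpb <= 16; bpb <<= 1) {
    uint32_t log2_bpp = (bpb >> 2) + ((bpb >> 1) >> (bpb >> 2));

    // Reference value computed the slow way.
    uint32_t expected = 0;
    while ((1u << expected) < bpb) ++expected;

    assert(log2_bpp == expected);
    std::printf("bytes_per_block=%2u -> log2=%u\n", bpb, log2_bpp);
  }
  return 0;
}
```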