diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc index b5c8dfb47..8219b7aa4 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -371,79 +371,79 @@ const D3D12TextureCache::HostFormat D3D12TextureCache::host_formats_[64] = { const D3D12TextureCache::LoadModeInfo D3D12TextureCache::load_mode_info_[] = { {shaders::texture_load_8bpb_cs, sizeof(shaders::texture_load_8bpb_cs), shaders::texture_load_8bpb_scaled_cs, - sizeof(shaders::texture_load_8bpb_scaled_cs), 3, 4, 4, 16}, + sizeof(shaders::texture_load_8bpb_scaled_cs), 3, 4, 1, 4, 16}, {shaders::texture_load_16bpb_cs, sizeof(shaders::texture_load_16bpb_cs), shaders::texture_load_16bpb_scaled_cs, - sizeof(shaders::texture_load_16bpb_scaled_cs), 4, 4, 4, 16}, + sizeof(shaders::texture_load_16bpb_scaled_cs), 4, 4, 2, 4, 16}, {shaders::texture_load_32bpb_cs, sizeof(shaders::texture_load_32bpb_cs), shaders::texture_load_32bpb_scaled_cs, - sizeof(shaders::texture_load_32bpb_scaled_cs), 4, 4, 3, 8}, + sizeof(shaders::texture_load_32bpb_scaled_cs), 4, 4, 4, 3, 8}, {shaders::texture_load_64bpb_cs, sizeof(shaders::texture_load_64bpb_cs), shaders::texture_load_64bpb_scaled_cs, - sizeof(shaders::texture_load_64bpb_scaled_cs), 4, 4, 2, 4}, + sizeof(shaders::texture_load_64bpb_scaled_cs), 4, 4, 8, 2, 4}, {shaders::texture_load_128bpb_cs, sizeof(shaders::texture_load_128bpb_cs), shaders::texture_load_128bpb_scaled_cs, - sizeof(shaders::texture_load_128bpb_scaled_cs), 4, 4, 1, 2}, + sizeof(shaders::texture_load_128bpb_scaled_cs), 4, 4, 16, 1, 2}, {shaders::texture_load_r5g5b5a1_b5g5r5a1_cs, sizeof(shaders::texture_load_r5g5b5a1_b5g5r5a1_cs), shaders::texture_load_r5g5b5a1_b5g5r5a1_scaled_cs, - sizeof(shaders::texture_load_r5g5b5a1_b5g5r5a1_scaled_cs), 4, 4, 4, 16}, + sizeof(shaders::texture_load_r5g5b5a1_b5g5r5a1_scaled_cs), 4, 4, 2, 4, 16}, {shaders::texture_load_r5g6b5_b5g6r5_cs, 
sizeof(shaders::texture_load_r5g6b5_b5g6r5_cs), shaders::texture_load_r5g6b5_b5g6r5_scaled_cs, - sizeof(shaders::texture_load_r5g6b5_b5g6r5_scaled_cs), 4, 4, 4, 16}, + sizeof(shaders::texture_load_r5g6b5_b5g6r5_scaled_cs), 4, 4, 2, 4, 16}, {shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_cs, sizeof(shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_cs), shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_scaled_cs, sizeof(shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_scaled_cs), 4, 4, - 4, 16}, + 2, 4, 16}, {shaders::texture_load_r4g4b4a4_b4g4r4a4_cs, sizeof(shaders::texture_load_r4g4b4a4_b4g4r4a4_cs), shaders::texture_load_r4g4b4a4_b4g4r4a4_scaled_cs, - sizeof(shaders::texture_load_r4g4b4a4_b4g4r4a4_scaled_cs), 4, 4, 4, 16}, + sizeof(shaders::texture_load_r4g4b4a4_b4g4r4a4_scaled_cs), 4, 4, 2, 4, 16}, {shaders::texture_load_r10g11b11_rgba16_cs, sizeof(shaders::texture_load_r10g11b11_rgba16_cs), shaders::texture_load_r10g11b11_rgba16_scaled_cs, - sizeof(shaders::texture_load_r10g11b11_rgba16_scaled_cs), 4, 4, 3, 8}, + sizeof(shaders::texture_load_r10g11b11_rgba16_scaled_cs), 4, 4, 8, 3, 8}, {shaders::texture_load_r10g11b11_rgba16_snorm_cs, sizeof(shaders::texture_load_r10g11b11_rgba16_snorm_cs), shaders::texture_load_r10g11b11_rgba16_snorm_scaled_cs, - sizeof(shaders::texture_load_r10g11b11_rgba16_snorm_scaled_cs), 4, 4, 3, + sizeof(shaders::texture_load_r10g11b11_rgba16_snorm_scaled_cs), 4, 4, 8, 3, 8}, {shaders::texture_load_r11g11b10_rgba16_cs, sizeof(shaders::texture_load_r11g11b10_rgba16_cs), shaders::texture_load_r11g11b10_rgba16_scaled_cs, - sizeof(shaders::texture_load_r11g11b10_rgba16_scaled_cs), 4, 4, 3, 8}, + sizeof(shaders::texture_load_r11g11b10_rgba16_scaled_cs), 4, 4, 8, 3, 8}, {shaders::texture_load_r11g11b10_rgba16_snorm_cs, sizeof(shaders::texture_load_r11g11b10_rgba16_snorm_cs), shaders::texture_load_r11g11b10_rgba16_snorm_scaled_cs, - sizeof(shaders::texture_load_r11g11b10_rgba16_snorm_scaled_cs), 4, 4, 3, + 
sizeof(shaders::texture_load_r11g11b10_rgba16_snorm_scaled_cs), 4, 4, 8, 3, 8}, {shaders::texture_load_dxt1_rgba8_cs, - sizeof(shaders::texture_load_dxt1_rgba8_cs), nullptr, 0, 4, 4, 2, 16}, + sizeof(shaders::texture_load_dxt1_rgba8_cs), nullptr, 0, 4, 4, 4, 2, 16}, {shaders::texture_load_dxt3_rgba8_cs, - sizeof(shaders::texture_load_dxt3_rgba8_cs), nullptr, 0, 4, 4, 1, 8}, + sizeof(shaders::texture_load_dxt3_rgba8_cs), nullptr, 0, 4, 4, 4, 1, 8}, {shaders::texture_load_dxt5_rgba8_cs, - sizeof(shaders::texture_load_dxt5_rgba8_cs), nullptr, 0, 4, 4, 1, 8}, + sizeof(shaders::texture_load_dxt5_rgba8_cs), nullptr, 0, 4, 4, 4, 1, 8}, {shaders::texture_load_dxn_rg8_cs, sizeof(shaders::texture_load_dxn_rg8_cs), - nullptr, 0, 4, 4, 1, 8}, + nullptr, 0, 4, 4, 2, 1, 8}, {shaders::texture_load_dxt3a_cs, sizeof(shaders::texture_load_dxt3a_cs), - nullptr, 0, 4, 4, 2, 16}, + nullptr, 0, 4, 4, 1, 2, 16}, {shaders::texture_load_dxt3aas1111_bgra4_cs, - sizeof(shaders::texture_load_dxt3aas1111_bgra4_cs), nullptr, 0, 4, 4, 2, + sizeof(shaders::texture_load_dxt3aas1111_bgra4_cs), nullptr, 0, 4, 4, 2, 2, 16}, {shaders::texture_load_dxt5a_r8_cs, - sizeof(shaders::texture_load_dxt5a_r8_cs), nullptr, 0, 4, 4, 2, 16}, + sizeof(shaders::texture_load_dxt5a_r8_cs), nullptr, 0, 4, 4, 1, 2, 16}, {shaders::texture_load_ctx1_cs, sizeof(shaders::texture_load_ctx1_cs), - nullptr, 0, 4, 4, 2, 16}, + nullptr, 0, 4, 4, 2, 2, 16}, {shaders::texture_load_depth_unorm_cs, sizeof(shaders::texture_load_depth_unorm_cs), shaders::texture_load_depth_unorm_scaled_cs, - sizeof(shaders::texture_load_depth_unorm_scaled_cs), 4, 4, 3, 8}, + sizeof(shaders::texture_load_depth_unorm_scaled_cs), 4, 4, 4, 3, 8}, {shaders::texture_load_depth_float_cs, sizeof(shaders::texture_load_depth_float_cs), shaders::texture_load_depth_float_scaled_cs, - sizeof(shaders::texture_load_depth_float_scaled_cs), 4, 4, 3, 8}, + sizeof(shaders::texture_load_depth_float_scaled_cs), 4, 4, 4, 3, 8}, }; 
D3D12TextureCache::D3D12TextureCache(const RegisterFile& register_file, @@ -1527,6 +1527,15 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, texture_resolution_scaled ? draw_resolution_scale_y() : 1; // Get the host layout and the buffer. + uint32_t host_block_width, host_block_height; + if (host_formats_[uint32_t(guest_format)].is_block_compressed && + !IsDecompressionNeeded(guest_format, width, height)) { + host_block_width = block_width; + host_block_height = block_height; + } else { + host_block_width = 1; + host_block_height = 1; + } UINT64 copy_buffer_size = 0; D3D12_PLACED_SUBRESOURCE_FOOTPRINT host_slice_layout_base; UINT64 host_slice_size_base; @@ -1543,13 +1552,8 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, // GetCopyableFootprints aligns row offsets, but not the total size) are // properly padded to the number of blocks copied in an invocation without // implicit assumptions about D3D12_TEXTURE_DATA_PITCH_ALIGNMENT. 
- DXGI_FORMAT host_copy_format; - uint32_t host_block_width; - uint32_t host_block_height; - uint32_t host_bytes_per_block; - ui::d3d12::util::GetFormatCopyInfo( - GetDXGIResourceFormat(guest_format, width, height), 0, host_copy_format, - host_block_width, host_block_height, host_bytes_per_block); + DXGI_FORMAT host_copy_format = + GetDXGIResourceFormat(guest_format, width, height); if (!level_first) { host_slice_layout_base.Offset = copy_buffer_size; host_slice_layout_base.Footprint.Format = host_copy_format; @@ -1576,7 +1580,7 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, xe::align(xe::round_up(host_slice_layout_base.Footprint.Width / host_block_width, load_mode_info.host_x_blocks_per_thread) * - host_bytes_per_block, + load_mode_info.bytes_per_host_block, uint32_t(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT)); host_slice_size_base = xe::align( UINT64(host_slice_layout_base.Footprint.RowPitch) * @@ -1621,7 +1625,7 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, xe::align(xe::round_up(host_slice_layout_mip.Footprint.Width / host_block_width, load_mode_info.host_x_blocks_per_thread) * - host_bytes_per_block, + load_mode_info.bytes_per_host_block, uint32_t(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT)); UINT64 host_slice_sizes_mip = xe::align( UINT64(host_slice_layout_mip.Footprint.RowPitch) * @@ -1640,13 +1644,6 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, if (copy_buffer == nullptr) { return false; } - uint32_t host_block_width = 1; - uint32_t host_block_height = 1; - if (host_formats_[uint32_t(guest_format)].dxgi_format_block_aligned && - !IsDecompressionNeeded(guest_format, width, height)) { - host_block_width = block_width; - host_block_height = block_height; - } // Begin loading. 
// May use different buffers for scaled base and mips, and also can't address diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.h b/src/xenia/gpu/d3d12/d3d12_texture_cache.h index fe6313cc4..d3cbae934 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h @@ -224,8 +224,12 @@ class D3D12TextureCache final : public TextureCache { // may copy multiple blocks per one invocation. uint32_t srv_bpe_log2; uint32_t uav_bpe_log2; - // Log2 of the number of guest blocks along the X axis loaded by a single - // thread shader group. + // Number of bytes in a host resolution-scaled block (corresponding to a + // guest block if not decompressing, or a host texel if decompressing) + // written by the shader. + uint32_t bytes_per_host_block; + // Log2 of the number of guest resolution-scaled blocks along the X axis + // loaded by a single compute shader thread. uint32_t guest_x_blocks_per_thread_log2; // Number of host blocks (or texels for uncompressed) along the X axis // written by a single compute shader thread - rows in the upload buffer are @@ -262,8 +266,9 @@ class D3D12TextureCache final : public TextureCache { // textures and multiplication to the tfetch implementation. // Whether the DXGI format, if not uncompressing the texture, consists of - // blocks, thus copy regions must be aligned to block size. - bool dxgi_format_block_aligned; + // blocks, thus copy regions must be aligned to block size (assuming it's + // the same as the guest block size). 
+ bool is_block_compressed; // Uncompression info for when the regular host format for this texture is // block-compressed, but the size is not block-aligned, and thus such // texture cannot be created in Direct3D on PC and needs decompression, diff --git a/src/xenia/ui/d3d12/d3d12_util.cc b/src/xenia/ui/d3d12/d3d12_util.cc index e7a103140..caea2b296 100644 --- a/src/xenia/ui/d3d12/d3d12_util.cc +++ b/src/xenia/ui/d3d12/d3d12_util.cc @@ -127,188 +127,6 @@ void CreateBufferTypedUAV(ID3D12Device* device, device->CreateUnorderedAccessView(buffer, nullptr, &desc, handle); } -void GetFormatCopyInfo(DXGI_FORMAT format, uint32_t plane, - DXGI_FORMAT& copy_format_out, uint32_t& block_width_out, - uint32_t& block_height_out, - uint32_t& bytes_per_block_out) { - DXGI_FORMAT copy_format = format; - uint32_t block_width = 1; - uint32_t block_height = 1; - uint32_t bytes_per_block = 1; - switch (format) { - case DXGI_FORMAT_R32G32B32A32_TYPELESS: - case DXGI_FORMAT_R32G32B32A32_FLOAT: - case DXGI_FORMAT_R32G32B32A32_UINT: - case DXGI_FORMAT_R32G32B32A32_SINT: - bytes_per_block = 16; - break; - case DXGI_FORMAT_R32G32B32_TYPELESS: - case DXGI_FORMAT_R32G32B32_FLOAT: - case DXGI_FORMAT_R32G32B32_UINT: - case DXGI_FORMAT_R32G32B32_SINT: - bytes_per_block = 12; - break; - case DXGI_FORMAT_R16G16B16A16_TYPELESS: - case DXGI_FORMAT_R16G16B16A16_FLOAT: - case DXGI_FORMAT_R16G16B16A16_UNORM: - case DXGI_FORMAT_R16G16B16A16_UINT: - case DXGI_FORMAT_R16G16B16A16_SNORM: - case DXGI_FORMAT_R16G16B16A16_SINT: - case DXGI_FORMAT_R32G32_TYPELESS: - case DXGI_FORMAT_R32G32_FLOAT: - case DXGI_FORMAT_R32G32_UINT: - case DXGI_FORMAT_R32G32_SINT: - case DXGI_FORMAT_Y416: - bytes_per_block = 8; - break; - case DXGI_FORMAT_R32G8X24_TYPELESS: - case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: - case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS: - case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT: - case DXGI_FORMAT_R24G8_TYPELESS: - case DXGI_FORMAT_D24_UNORM_S8_UINT: - case DXGI_FORMAT_R24_UNORM_X8_TYPELESS: - case 
DXGI_FORMAT_X24_TYPELESS_G8_UINT: - if (plane) { - copy_format = DXGI_FORMAT_R8_TYPELESS; - bytes_per_block = 1; - } else { - copy_format = DXGI_FORMAT_R32_TYPELESS; - bytes_per_block = 4; - } - break; - case DXGI_FORMAT_R10G10B10A2_TYPELESS: - case DXGI_FORMAT_R10G10B10A2_UNORM: - case DXGI_FORMAT_R10G10B10A2_UINT: - case DXGI_FORMAT_R11G11B10_FLOAT: - case DXGI_FORMAT_R8G8B8A8_TYPELESS: - case DXGI_FORMAT_R8G8B8A8_UNORM: - case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: - case DXGI_FORMAT_R8G8B8A8_UINT: - case DXGI_FORMAT_R8G8B8A8_SNORM: - case DXGI_FORMAT_R8G8B8A8_SINT: - case DXGI_FORMAT_R16G16_TYPELESS: - case DXGI_FORMAT_R16G16_FLOAT: - case DXGI_FORMAT_R16G16_UNORM: - case DXGI_FORMAT_R16G16_UINT: - case DXGI_FORMAT_R16G16_SNORM: - case DXGI_FORMAT_R16G16_SINT: - case DXGI_FORMAT_R32_TYPELESS: - case DXGI_FORMAT_D32_FLOAT: - case DXGI_FORMAT_R32_FLOAT: - case DXGI_FORMAT_R32_UINT: - case DXGI_FORMAT_R32_SINT: - case DXGI_FORMAT_R9G9B9E5_SHAREDEXP: - case DXGI_FORMAT_B8G8R8A8_UNORM: - case DXGI_FORMAT_B8G8R8X8_UNORM: - case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: - case DXGI_FORMAT_B8G8R8A8_TYPELESS: - case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: - case DXGI_FORMAT_B8G8R8X8_TYPELESS: - case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: - case DXGI_FORMAT_AYUV: - case DXGI_FORMAT_Y410: - bytes_per_block = 4; - break; - case DXGI_FORMAT_R8G8_TYPELESS: - case DXGI_FORMAT_R8G8_UNORM: - case DXGI_FORMAT_R8G8_UINT: - case DXGI_FORMAT_R8G8_SNORM: - case DXGI_FORMAT_R8G8_SINT: - case DXGI_FORMAT_R16_TYPELESS: - case DXGI_FORMAT_R16_FLOAT: - case DXGI_FORMAT_D16_UNORM: - case DXGI_FORMAT_R16_UNORM: - case DXGI_FORMAT_R16_UINT: - case DXGI_FORMAT_R16_SNORM: - case DXGI_FORMAT_R16_SINT: - case DXGI_FORMAT_B5G6R5_UNORM: - case DXGI_FORMAT_B5G5R5A1_UNORM: - case DXGI_FORMAT_A8P8: - case DXGI_FORMAT_B4G4R4A4_UNORM: - bytes_per_block = 2; - break; - case DXGI_FORMAT_R8_TYPELESS: - case DXGI_FORMAT_R8_UNORM: - case DXGI_FORMAT_R8_UINT: - case DXGI_FORMAT_R8_SNORM: - case DXGI_FORMAT_R8_SINT: - case 
DXGI_FORMAT_A8_UNORM: - case DXGI_FORMAT_AI44: - case DXGI_FORMAT_IA44: - case DXGI_FORMAT_P8: - bytes_per_block = 1; - break; - // R1_UNORM is not supported in Direct3D 12. - case DXGI_FORMAT_R8G8_B8G8_UNORM: - case DXGI_FORMAT_G8R8_G8B8_UNORM: - case DXGI_FORMAT_Y210: - case DXGI_FORMAT_Y216: - // Failed to GetCopyableFootprints for Y210 and Y216 on Intel UHD Graphics - // 630. - block_width = 2; - bytes_per_block = 4; - break; - case DXGI_FORMAT_BC1_TYPELESS: - case DXGI_FORMAT_BC1_UNORM: - case DXGI_FORMAT_BC1_UNORM_SRGB: - case DXGI_FORMAT_BC4_TYPELESS: - case DXGI_FORMAT_BC4_UNORM: - case DXGI_FORMAT_BC4_SNORM: - block_width = 4; - block_height = 4; - bytes_per_block = 8; - break; - case DXGI_FORMAT_BC2_TYPELESS: - case DXGI_FORMAT_BC2_UNORM: - case DXGI_FORMAT_BC2_UNORM_SRGB: - case DXGI_FORMAT_BC3_TYPELESS: - case DXGI_FORMAT_BC3_UNORM: - case DXGI_FORMAT_BC3_UNORM_SRGB: - case DXGI_FORMAT_BC5_TYPELESS: - case DXGI_FORMAT_BC5_UNORM: - case DXGI_FORMAT_BC5_SNORM: - case DXGI_FORMAT_BC6H_TYPELESS: - case DXGI_FORMAT_BC6H_UF16: - case DXGI_FORMAT_BC6H_SF16: - case DXGI_FORMAT_BC7_TYPELESS: - case DXGI_FORMAT_BC7_UNORM: - case DXGI_FORMAT_BC7_UNORM_SRGB: - block_width = 4; - block_height = 4; - bytes_per_block = 16; - break; - // NV12, P010, P016, 420_OPAQUE and NV11 are not handled here because of - // differences that need to be handled externally. - // For future reference, if needed: - // - Width and height of planes 1 and 2 are divided by the block size in the - // footprint itself (unlike in block-compressed textures, where the - // dimensions are merely aligned). - // - Rows are aligned to the placement alignment (512) rather than the pitch - // alignment (256) for some reason (to match the Direct3D 11 layout - // without explicit planes, requiring the plane data to be laid out in - // some specific way defined on MSDN within each row, though Direct3D 12 - // possibly doesn't have such requirement, but investigation needed. 
- // - NV12: R8_TYPELESS plane 0, R8G8_TYPELESS plane 1. - // - P010, P016: R16_TYPELESS plane 0, R16G16_TYPELESS plane 1. Failed to - // GetCopyableFootprints for P016 on Nvidia GeForce GTX 1070. - // - 420_OPAQUE: Single R8_TYPELESS plane. - // - NV11: Failed to GetCopyableFootprints on both Nvidia GeForce GTX 1070 - // and Intel UHD Graphics 630. - case DXGI_FORMAT_YUY2: - block_width = 2; - bytes_per_block = 2; - break; - // P208, V208 and V408 are not supported in Direct3D 12. - default: - assert_unhandled_case(format); - } - copy_format_out = copy_format; - block_width_out = block_width; - block_height_out = block_height; - bytes_per_block_out = bytes_per_block; -} - } // namespace util } // namespace d3d12 } // namespace ui diff --git a/src/xenia/ui/d3d12/d3d12_util.h b/src/xenia/ui/d3d12/d3d12_util.h index ceaed7839..4c8f8776b 100644 --- a/src/xenia/ui/d3d12/d3d12_util.h +++ b/src/xenia/ui/d3d12/d3d12_util.h @@ -93,14 +93,6 @@ void CreateBufferTypedUAV(ID3D12Device* device, ID3D12Resource* buffer, DXGI_FORMAT format, uint32_t num_elements, uint64_t first_element = 0); -// For cases where GetCopyableFootprints isn't usable (such as when the size -// needs to be overaligned beyond the maximum texture size), providing data -// needed to compute the copyable footprints manually. -void GetFormatCopyInfo(DXGI_FORMAT format, uint32_t plane, - DXGI_FORMAT& copy_format_out, uint32_t& block_width_out, - uint32_t& block_height_out, - uint32_t& bytes_per_block_out); - } // namespace util } // namespace d3d12 } // namespace ui