[D3D12] Texture host BPB in LoadModeInfo

This commit is contained in:
Triang3l 2022-05-22 19:28:05 +03:00
parent 003c62ba73
commit 8f06ba6f7d
4 changed files with 45 additions and 233 deletions

View File

@ -371,79 +371,79 @@ const D3D12TextureCache::HostFormat D3D12TextureCache::host_formats_[64] = {
// Table of texture load modes. Each row names a texture_load_*_cs shader blob
// and its sizeof, then the matching *_scaled_cs blob and its sizeof (nullptr
// and 0 for the DXT1/DXT3/DXT5/DXN/DXT3A/DXT3AAS1111/DXT5A/CTX1 rows, which
// list no scaled variant), then a run of numeric fields.
// NOTE(review): the numeric fields presumably follow the LoadModeInfo member
// order - srv_bpe_log2, uav_bpe_log2, bytes_per_host_block,
// guest_x_blocks_per_thread_log2, host_x_blocks_per_thread - confirm against
// the struct declaration.
// NOTE(review): in this listing most rows appear twice, once with four
// trailing numbers and once with five; the five-number form has an extra third
// value that looks like bytes_per_host_block - confirm which set is current.
const D3D12TextureCache::LoadModeInfo D3D12TextureCache::load_mode_info_[] = {
{shaders::texture_load_8bpb_cs, sizeof(shaders::texture_load_8bpb_cs),
shaders::texture_load_8bpb_scaled_cs,
sizeof(shaders::texture_load_8bpb_scaled_cs), 3, 4, 4, 16},
sizeof(shaders::texture_load_8bpb_scaled_cs), 3, 4, 1, 4, 16},
{shaders::texture_load_16bpb_cs, sizeof(shaders::texture_load_16bpb_cs),
shaders::texture_load_16bpb_scaled_cs,
sizeof(shaders::texture_load_16bpb_scaled_cs), 4, 4, 4, 16},
sizeof(shaders::texture_load_16bpb_scaled_cs), 4, 4, 2, 4, 16},
{shaders::texture_load_32bpb_cs, sizeof(shaders::texture_load_32bpb_cs),
shaders::texture_load_32bpb_scaled_cs,
sizeof(shaders::texture_load_32bpb_scaled_cs), 4, 4, 3, 8},
sizeof(shaders::texture_load_32bpb_scaled_cs), 4, 4, 4, 3, 8},
{shaders::texture_load_64bpb_cs, sizeof(shaders::texture_load_64bpb_cs),
shaders::texture_load_64bpb_scaled_cs,
sizeof(shaders::texture_load_64bpb_scaled_cs), 4, 4, 2, 4},
sizeof(shaders::texture_load_64bpb_scaled_cs), 4, 4, 8, 2, 4},
{shaders::texture_load_128bpb_cs, sizeof(shaders::texture_load_128bpb_cs),
shaders::texture_load_128bpb_scaled_cs,
sizeof(shaders::texture_load_128bpb_scaled_cs), 4, 4, 1, 2},
sizeof(shaders::texture_load_128bpb_scaled_cs), 4, 4, 16, 1, 2},
{shaders::texture_load_r5g5b5a1_b5g5r5a1_cs,
sizeof(shaders::texture_load_r5g5b5a1_b5g5r5a1_cs),
shaders::texture_load_r5g5b5a1_b5g5r5a1_scaled_cs,
sizeof(shaders::texture_load_r5g5b5a1_b5g5r5a1_scaled_cs), 4, 4, 4, 16},
sizeof(shaders::texture_load_r5g5b5a1_b5g5r5a1_scaled_cs), 4, 4, 2, 4, 16},
{shaders::texture_load_r5g6b5_b5g6r5_cs,
sizeof(shaders::texture_load_r5g6b5_b5g6r5_cs),
shaders::texture_load_r5g6b5_b5g6r5_scaled_cs,
sizeof(shaders::texture_load_r5g6b5_b5g6r5_scaled_cs), 4, 4, 4, 16},
sizeof(shaders::texture_load_r5g6b5_b5g6r5_scaled_cs), 4, 4, 2, 4, 16},
{shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_cs,
sizeof(shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_cs),
shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_scaled_cs,
sizeof(shaders::texture_load_r5g5b6_b5g6r5_swizzle_rbga_scaled_cs), 4, 4,
4, 16},
2, 4, 16},
{shaders::texture_load_r4g4b4a4_b4g4r4a4_cs,
sizeof(shaders::texture_load_r4g4b4a4_b4g4r4a4_cs),
shaders::texture_load_r4g4b4a4_b4g4r4a4_scaled_cs,
sizeof(shaders::texture_load_r4g4b4a4_b4g4r4a4_scaled_cs), 4, 4, 4, 16},
sizeof(shaders::texture_load_r4g4b4a4_b4g4r4a4_scaled_cs), 4, 4, 2, 4, 16},
{shaders::texture_load_r10g11b11_rgba16_cs,
sizeof(shaders::texture_load_r10g11b11_rgba16_cs),
shaders::texture_load_r10g11b11_rgba16_scaled_cs,
sizeof(shaders::texture_load_r10g11b11_rgba16_scaled_cs), 4, 4, 3, 8},
sizeof(shaders::texture_load_r10g11b11_rgba16_scaled_cs), 4, 4, 8, 3, 8},
{shaders::texture_load_r10g11b11_rgba16_snorm_cs,
sizeof(shaders::texture_load_r10g11b11_rgba16_snorm_cs),
shaders::texture_load_r10g11b11_rgba16_snorm_scaled_cs,
sizeof(shaders::texture_load_r10g11b11_rgba16_snorm_scaled_cs), 4, 4, 3,
sizeof(shaders::texture_load_r10g11b11_rgba16_snorm_scaled_cs), 4, 4, 8, 3,
8},
{shaders::texture_load_r11g11b10_rgba16_cs,
sizeof(shaders::texture_load_r11g11b10_rgba16_cs),
shaders::texture_load_r11g11b10_rgba16_scaled_cs,
sizeof(shaders::texture_load_r11g11b10_rgba16_scaled_cs), 4, 4, 3, 8},
sizeof(shaders::texture_load_r11g11b10_rgba16_scaled_cs), 4, 4, 8, 3, 8},
{shaders::texture_load_r11g11b10_rgba16_snorm_cs,
sizeof(shaders::texture_load_r11g11b10_rgba16_snorm_cs),
shaders::texture_load_r11g11b10_rgba16_snorm_scaled_cs,
sizeof(shaders::texture_load_r11g11b10_rgba16_snorm_scaled_cs), 4, 4, 3,
sizeof(shaders::texture_load_r11g11b10_rgba16_snorm_scaled_cs), 4, 4, 8, 3,
8},
{shaders::texture_load_dxt1_rgba8_cs,
sizeof(shaders::texture_load_dxt1_rgba8_cs), nullptr, 0, 4, 4, 2, 16},
sizeof(shaders::texture_load_dxt1_rgba8_cs), nullptr, 0, 4, 4, 4, 2, 16},
{shaders::texture_load_dxt3_rgba8_cs,
sizeof(shaders::texture_load_dxt3_rgba8_cs), nullptr, 0, 4, 4, 1, 8},
sizeof(shaders::texture_load_dxt3_rgba8_cs), nullptr, 0, 4, 4, 4, 1, 8},
{shaders::texture_load_dxt5_rgba8_cs,
sizeof(shaders::texture_load_dxt5_rgba8_cs), nullptr, 0, 4, 4, 1, 8},
sizeof(shaders::texture_load_dxt5_rgba8_cs), nullptr, 0, 4, 4, 4, 1, 8},
{shaders::texture_load_dxn_rg8_cs, sizeof(shaders::texture_load_dxn_rg8_cs),
nullptr, 0, 4, 4, 1, 8},
nullptr, 0, 4, 4, 2, 1, 8},
{shaders::texture_load_dxt3a_cs, sizeof(shaders::texture_load_dxt3a_cs),
nullptr, 0, 4, 4, 2, 16},
nullptr, 0, 4, 4, 1, 2, 16},
{shaders::texture_load_dxt3aas1111_bgra4_cs,
sizeof(shaders::texture_load_dxt3aas1111_bgra4_cs), nullptr, 0, 4, 4, 2,
sizeof(shaders::texture_load_dxt3aas1111_bgra4_cs), nullptr, 0, 4, 4, 2, 2,
16},
{shaders::texture_load_dxt5a_r8_cs,
sizeof(shaders::texture_load_dxt5a_r8_cs), nullptr, 0, 4, 4, 2, 16},
sizeof(shaders::texture_load_dxt5a_r8_cs), nullptr, 0, 4, 4, 1, 2, 16},
{shaders::texture_load_ctx1_cs, sizeof(shaders::texture_load_ctx1_cs),
nullptr, 0, 4, 4, 2, 16},
nullptr, 0, 4, 4, 2, 2, 16},
{shaders::texture_load_depth_unorm_cs,
sizeof(shaders::texture_load_depth_unorm_cs),
shaders::texture_load_depth_unorm_scaled_cs,
sizeof(shaders::texture_load_depth_unorm_scaled_cs), 4, 4, 3, 8},
sizeof(shaders::texture_load_depth_unorm_scaled_cs), 4, 4, 4, 3, 8},
{shaders::texture_load_depth_float_cs,
sizeof(shaders::texture_load_depth_float_cs),
shaders::texture_load_depth_float_scaled_cs,
sizeof(shaders::texture_load_depth_float_scaled_cs), 4, 4, 3, 8},
sizeof(shaders::texture_load_depth_float_scaled_cs), 4, 4, 4, 3, 8},
};
D3D12TextureCache::D3D12TextureCache(const RegisterFile& register_file,
@ -1527,6 +1527,15 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture,
texture_resolution_scaled ? draw_resolution_scale_y() : 1;
// Get the host layout and the buffer.
uint32_t host_block_width, host_block_height;
if (host_formats_[uint32_t(guest_format)].is_block_compressed &&
!IsDecompressionNeeded(guest_format, width, height)) {
host_block_width = block_width;
host_block_height = block_height;
} else {
host_block_width = 1;
host_block_height = 1;
}
UINT64 copy_buffer_size = 0;
D3D12_PLACED_SUBRESOURCE_FOOTPRINT host_slice_layout_base;
UINT64 host_slice_size_base;
@ -1543,13 +1552,8 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture,
// GetCopyableFootprints aligns row offsets, but not the total size) are
// properly padded to the number of blocks copied in an invocation without
// implicit assumptions about D3D12_TEXTURE_DATA_PITCH_ALIGNMENT.
DXGI_FORMAT host_copy_format;
uint32_t host_block_width;
uint32_t host_block_height;
uint32_t host_bytes_per_block;
ui::d3d12::util::GetFormatCopyInfo(
GetDXGIResourceFormat(guest_format, width, height), 0, host_copy_format,
host_block_width, host_block_height, host_bytes_per_block);
DXGI_FORMAT host_copy_format =
GetDXGIResourceFormat(guest_format, width, height);
if (!level_first) {
host_slice_layout_base.Offset = copy_buffer_size;
host_slice_layout_base.Footprint.Format = host_copy_format;
@ -1576,7 +1580,7 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture,
xe::align(xe::round_up(host_slice_layout_base.Footprint.Width /
host_block_width,
load_mode_info.host_x_blocks_per_thread) *
host_bytes_per_block,
load_mode_info.bytes_per_host_block,
uint32_t(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT));
host_slice_size_base = xe::align(
UINT64(host_slice_layout_base.Footprint.RowPitch) *
@ -1621,7 +1625,7 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture,
xe::align(xe::round_up(host_slice_layout_mip.Footprint.Width /
host_block_width,
load_mode_info.host_x_blocks_per_thread) *
host_bytes_per_block,
load_mode_info.bytes_per_host_block,
uint32_t(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT));
UINT64 host_slice_sizes_mip = xe::align(
UINT64(host_slice_layout_mip.Footprint.RowPitch) *
@ -1640,13 +1644,6 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture,
if (copy_buffer == nullptr) {
return false;
}
uint32_t host_block_width = 1;
uint32_t host_block_height = 1;
if (host_formats_[uint32_t(guest_format)].dxgi_format_block_aligned &&
!IsDecompressionNeeded(guest_format, width, height)) {
host_block_width = block_width;
host_block_height = block_height;
}
// Begin loading.
// May use different buffers for scaled base and mips, and also can't address

View File

@ -224,8 +224,12 @@ class D3D12TextureCache final : public TextureCache {
// may copy multiple blocks per one invocation.
uint32_t srv_bpe_log2;
uint32_t uav_bpe_log2;
// Log2 of the number of guest blocks along the X axis loaded by a single
// thread shader group.
// Number of bytes in a host resolution-scaled block (corresponding to a
// guest block if not decompressing, or a host texel if decompressing)
// written by the shader.
uint32_t bytes_per_host_block;
// Log2 of the number of guest resolution-scaled blocks along the X axis
// loaded by a single compute shader thread.
uint32_t guest_x_blocks_per_thread_log2;
// Number of host blocks (or texels for uncompressed) along the X axis
// written by a single compute shader thread - rows in the upload buffer are
@ -262,8 +266,9 @@ class D3D12TextureCache final : public TextureCache {
// textures and multiplication to the tfetch implementation.
// Whether the DXGI format, if not uncompressing the texture, consists of
// blocks, thus copy regions must be aligned to block size.
bool dxgi_format_block_aligned;
// blocks, thus copy regions must be aligned to block size (assuming it's
// the same as the guest block size).
bool is_block_compressed;
// Uncompression info for when the regular host format for this texture is
// block-compressed, but the size is not block-aligned, and thus such
// texture cannot be created in Direct3D on PC and needs decompression,

View File

@ -127,188 +127,6 @@ void CreateBufferTypedUAV(ID3D12Device* device,
device->CreateUnorderedAccessView(buffer, nullptr, &desc, handle);
}
// Reports, for a DXGI format, the values needed to build a copy footprint by
// hand: the format to use for the copy, the block dimensions in texels, and
// the number of bytes in one block.
//
// format              - format of the resource being copied.
// plane               - only consulted for the R32G8X24 / R24G8 families: zero
//                       selects the R32_TYPELESS (4-byte) copy layout, any
//                       other value the R8_TYPELESS (1-byte) one.
// copy_format_out     - receives `format` itself, except for the two-plane
//                       families above.
// block_width_out,
// block_height_out    - receive 1x1 for plain texel formats, 2x1 for the
//                       packed formats handled here, 4x4 for BC formats.
// bytes_per_block_out - receives the byte size of one such block.
//
// An unlisted format goes through assert_unhandled_case and leaves the outputs
// at `format`, 1, 1, 1.
void GetFormatCopyInfo(DXGI_FORMAT format, uint32_t plane,
                       DXGI_FORMAT& copy_format_out, uint32_t& block_width_out,
                       uint32_t& block_height_out,
                       uint32_t& bytes_per_block_out) {
  // Gathered locally so that the output references are only written once, after
  // the format has been classified.
  struct CopyInfo {
    DXGI_FORMAT copy_format;
    uint32_t block_width;
    uint32_t block_height;
    uint32_t bytes_per_block;
  };
  CopyInfo info = {format, 1, 1, 1};

  switch (format) {
    // 128 bits per texel.
    case DXGI_FORMAT_R32G32B32A32_TYPELESS:
    case DXGI_FORMAT_R32G32B32A32_FLOAT:
    case DXGI_FORMAT_R32G32B32A32_UINT:
    case DXGI_FORMAT_R32G32B32A32_SINT:
      info.bytes_per_block = 16;
      break;

    // 96 bits per texel.
    case DXGI_FORMAT_R32G32B32_TYPELESS:
    case DXGI_FORMAT_R32G32B32_FLOAT:
    case DXGI_FORMAT_R32G32B32_UINT:
    case DXGI_FORMAT_R32G32B32_SINT:
      info.bytes_per_block = 12;
      break;

    // 64 bits per texel.
    case DXGI_FORMAT_R16G16B16A16_TYPELESS:
    case DXGI_FORMAT_R16G16B16A16_FLOAT:
    case DXGI_FORMAT_R16G16B16A16_UNORM:
    case DXGI_FORMAT_R16G16B16A16_UINT:
    case DXGI_FORMAT_R16G16B16A16_SNORM:
    case DXGI_FORMAT_R16G16B16A16_SINT:
    case DXGI_FORMAT_R32G32_TYPELESS:
    case DXGI_FORMAT_R32G32_FLOAT:
    case DXGI_FORMAT_R32G32_UINT:
    case DXGI_FORMAT_R32G32_SINT:
    case DXGI_FORMAT_Y416:
      info.bytes_per_block = 8;
      break;

    // Two-plane formats: the copy format and block size depend on the plane.
    case DXGI_FORMAT_R32G8X24_TYPELESS:
    case DXGI_FORMAT_D32_FLOAT_S8X24_UINT:
    case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS:
    case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT:
    case DXGI_FORMAT_R24G8_TYPELESS:
    case DXGI_FORMAT_D24_UNORM_S8_UINT:
    case DXGI_FORMAT_R24_UNORM_X8_TYPELESS:
    case DXGI_FORMAT_X24_TYPELESS_G8_UINT:
      info.copy_format =
          plane ? DXGI_FORMAT_R8_TYPELESS : DXGI_FORMAT_R32_TYPELESS;
      info.bytes_per_block = plane ? 1 : 4;
      break;

    // 32 bits per texel.
    case DXGI_FORMAT_R10G10B10A2_TYPELESS:
    case DXGI_FORMAT_R10G10B10A2_UNORM:
    case DXGI_FORMAT_R10G10B10A2_UINT:
    case DXGI_FORMAT_R11G11B10_FLOAT:
    case DXGI_FORMAT_R8G8B8A8_TYPELESS:
    case DXGI_FORMAT_R8G8B8A8_UNORM:
    case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
    case DXGI_FORMAT_R8G8B8A8_UINT:
    case DXGI_FORMAT_R8G8B8A8_SNORM:
    case DXGI_FORMAT_R8G8B8A8_SINT:
    case DXGI_FORMAT_R16G16_TYPELESS:
    case DXGI_FORMAT_R16G16_FLOAT:
    case DXGI_FORMAT_R16G16_UNORM:
    case DXGI_FORMAT_R16G16_UINT:
    case DXGI_FORMAT_R16G16_SNORM:
    case DXGI_FORMAT_R16G16_SINT:
    case DXGI_FORMAT_R32_TYPELESS:
    case DXGI_FORMAT_D32_FLOAT:
    case DXGI_FORMAT_R32_FLOAT:
    case DXGI_FORMAT_R32_UINT:
    case DXGI_FORMAT_R32_SINT:
    case DXGI_FORMAT_R9G9B9E5_SHAREDEXP:
    case DXGI_FORMAT_B8G8R8A8_UNORM:
    case DXGI_FORMAT_B8G8R8X8_UNORM:
    case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
    case DXGI_FORMAT_B8G8R8A8_TYPELESS:
    case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
    case DXGI_FORMAT_B8G8R8X8_TYPELESS:
    case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
    case DXGI_FORMAT_AYUV:
    case DXGI_FORMAT_Y410:
      info.bytes_per_block = 4;
      break;

    // 16 bits per texel.
    case DXGI_FORMAT_R8G8_TYPELESS:
    case DXGI_FORMAT_R8G8_UNORM:
    case DXGI_FORMAT_R8G8_UINT:
    case DXGI_FORMAT_R8G8_SNORM:
    case DXGI_FORMAT_R8G8_SINT:
    case DXGI_FORMAT_R16_TYPELESS:
    case DXGI_FORMAT_R16_FLOAT:
    case DXGI_FORMAT_D16_UNORM:
    case DXGI_FORMAT_R16_UNORM:
    case DXGI_FORMAT_R16_UINT:
    case DXGI_FORMAT_R16_SNORM:
    case DXGI_FORMAT_R16_SINT:
    case DXGI_FORMAT_B5G6R5_UNORM:
    case DXGI_FORMAT_B5G5R5A1_UNORM:
    case DXGI_FORMAT_A8P8:
    case DXGI_FORMAT_B4G4R4A4_UNORM:
      info.bytes_per_block = 2;
      break;

    // 8 bits per texel.
    case DXGI_FORMAT_R8_TYPELESS:
    case DXGI_FORMAT_R8_UNORM:
    case DXGI_FORMAT_R8_UINT:
    case DXGI_FORMAT_R8_SNORM:
    case DXGI_FORMAT_R8_SINT:
    case DXGI_FORMAT_A8_UNORM:
    case DXGI_FORMAT_AI44:
    case DXGI_FORMAT_IA44:
    case DXGI_FORMAT_P8:
      info.bytes_per_block = 1;
      break;

    // R1_UNORM is skipped: Direct3D 12 does not support it.

    // Packed formats with 2x1 blocks.
    // Note: GetCopyableFootprints failed for Y210 and Y216 on Intel UHD
    // Graphics 630.
    case DXGI_FORMAT_R8G8_B8G8_UNORM:
    case DXGI_FORMAT_G8R8_G8B8_UNORM:
    case DXGI_FORMAT_Y210:
    case DXGI_FORMAT_Y216:
      info.block_width = 2;
      info.bytes_per_block = 4;
      break;

    // Block-compressed, 4x4 texels in 8 bytes.
    case DXGI_FORMAT_BC1_TYPELESS:
    case DXGI_FORMAT_BC1_UNORM:
    case DXGI_FORMAT_BC1_UNORM_SRGB:
    case DXGI_FORMAT_BC4_TYPELESS:
    case DXGI_FORMAT_BC4_UNORM:
    case DXGI_FORMAT_BC4_SNORM:
      info.block_width = 4;
      info.block_height = 4;
      info.bytes_per_block = 8;
      break;

    // Block-compressed, 4x4 texels in 16 bytes.
    case DXGI_FORMAT_BC2_TYPELESS:
    case DXGI_FORMAT_BC2_UNORM:
    case DXGI_FORMAT_BC2_UNORM_SRGB:
    case DXGI_FORMAT_BC3_TYPELESS:
    case DXGI_FORMAT_BC3_UNORM:
    case DXGI_FORMAT_BC3_UNORM_SRGB:
    case DXGI_FORMAT_BC5_TYPELESS:
    case DXGI_FORMAT_BC5_UNORM:
    case DXGI_FORMAT_BC5_SNORM:
    case DXGI_FORMAT_BC6H_TYPELESS:
    case DXGI_FORMAT_BC6H_UF16:
    case DXGI_FORMAT_BC6H_SF16:
    case DXGI_FORMAT_BC7_TYPELESS:
    case DXGI_FORMAT_BC7_UNORM:
    case DXGI_FORMAT_BC7_UNORM_SRGB:
      info.block_width = 4;
      info.block_height = 4;
      info.bytes_per_block = 16;
      break;

    // NV12, P010, P016, 420_OPAQUE and NV11 are intentionally absent: they
    // differ in ways the caller has to deal with. Notes for future reference:
    // - The width and height of planes 1 and 2 are divided by the block size in
    //   the footprint itself (in block-compressed textures the dimensions are
    //   merely aligned instead).
    // - Rows are aligned to the placement alignment (512) rather than the pitch
    //   alignment (256) for some reason (to match the Direct3D 11 layout
    //   without explicit planes, which requires the plane data to be laid out
    //   within each row in a specific way defined on MSDN; Direct3D 12 possibly
    //   has no such requirement, but this needs investigation).
    // - NV12: R8_TYPELESS plane 0, R8G8_TYPELESS plane 1.
    // - P010, P016: R16_TYPELESS plane 0, R16G16_TYPELESS plane 1.
    //   GetCopyableFootprints failed for P016 on Nvidia GeForce GTX 1070.
    // - 420_OPAQUE: a single R8_TYPELESS plane.
    // - NV11: GetCopyableFootprints failed on both Nvidia GeForce GTX 1070 and
    //   Intel UHD Graphics 630.
    case DXGI_FORMAT_YUY2:
      info.block_width = 2;
      info.bytes_per_block = 2;
      break;

    // P208, V208 and V408 are skipped: Direct3D 12 does not support them.

    default:
      assert_unhandled_case(format);
      break;
  }

  copy_format_out = info.copy_format;
  block_width_out = info.block_width;
  block_height_out = info.block_height;
  bytes_per_block_out = info.bytes_per_block;
}
} // namespace util
} // namespace d3d12
} // namespace ui

View File

@ -93,14 +93,6 @@ void CreateBufferTypedUAV(ID3D12Device* device,
ID3D12Resource* buffer, DXGI_FORMAT format,
uint32_t num_elements, uint64_t first_element = 0);
// For cases where GetCopyableFootprints isn't usable (such as when the size
// needs to be overaligned beyond the maximum texture size), providing data
// needed to compute the copyable footprints manually.
// - format: the DXGI format to describe.
// - plane: only consulted for the R32G8X24 and R24G8 format families, where
//   zero selects R32_TYPELESS (4 bytes per block) and any other value selects
//   R8_TYPELESS (1 byte per block).
// - copy_format_out: the format to use for the copy - the same as format
//   except for the two format families above.
// - block_width_out, block_height_out: block dimensions in texels - 1x1 for
//   plain formats, 2x1 for the packed formats handled, 4x4 for BC formats.
// - bytes_per_block_out: the size of one block in bytes.
// A format not handled by the implementation goes through
// assert_unhandled_case, and the outputs are then format, 1, 1, 1.
void GetFormatCopyInfo(DXGI_FORMAT format, uint32_t plane,
DXGI_FORMAT& copy_format_out, uint32_t& block_width_out,
uint32_t& block_height_out,
uint32_t& bytes_per_block_out);
} // namespace util
} // namespace d3d12
} // namespace ui