From 428095f62ad7040edf3df669938db2679399f5ba Mon Sep 17 00:00:00 2001 From: Triang3l Date: Wed, 15 Aug 2018 16:27:13 +0300 Subject: [PATCH] [D3D12] CTX1 texture loading shader --- .../gpu/d3d12/shaders/texture_copy.hlsli | 13 +-- .../d3d12/shaders/texture_load_128bpb.cs.hlsl | 6 +- .../d3d12/shaders/texture_load_16bpb.cs.hlsl | 6 +- .../d3d12/shaders/texture_load_32bpb.cs.hlsl | 6 +- .../d3d12/shaders/texture_load_64bpb.cs.hlsl | 6 +- .../d3d12/shaders/texture_load_8bpb.cs.hlsl | 6 +- .../d3d12/shaders/texture_load_ctx1.cs.hlsl | 101 ++++++++++++++++++ src/xenia/gpu/d3d12/texture_cache.cc | 23 ++-- src/xenia/gpu/d3d12/texture_cache.h | 9 +- 9 files changed, 143 insertions(+), 33 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl diff --git a/src/xenia/gpu/d3d12/shaders/texture_copy.hlsli b/src/xenia/gpu/d3d12/shaders/texture_copy.hlsli index ee7aa1ef5..20ddf8c44 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_copy.hlsli +++ b/src/xenia/gpu/d3d12/shaders/texture_copy.hlsli @@ -11,13 +11,14 @@ cbuffer xe_texture_copy_constants : register(b0) { uint xe_texture_copy_host_base; uint xe_texture_copy_host_pitch; - // Size in blocks. - uint3 xe_texture_copy_size; + uint3 xe_texture_copy_size_texels; bool xe_texture_copy_is_3d; + uint3 xe_texture_copy_size_blocks; + uint xe_texture_copy_endianness; + // Offset within the packed mip for small mips. 
uint3 xe_texture_copy_guest_mip_offset; - uint xe_texture_copy_endianness; }; #define XeTextureCopyGuestPitchTiled 0xFFFFFFFFu @@ -33,15 +34,15 @@ uint4 XeTextureCopyGuestBlockOffsets(uint3 block_index, uint bpb, [branch] if (xe_texture_copy_guest_pitch == XeTextureCopyGuestPitchTiled) { [branch] if (xe_texture_copy_is_3d) { block_offsets_guest = XeTextureTiledOffset3D( - block_index_guest, xe_texture_copy_size.xy, bpb_log2); + block_index_guest, xe_texture_copy_size_blocks.xy, bpb_log2); } else { block_offsets_guest = XeTextureTiledOffset2D( - block_index_guest.xy, xe_texture_copy_size.x, bpb_log2); + block_index_guest.xy, xe_texture_copy_size_blocks.x, bpb_log2); } } else { block_offsets_guest = uint4(0u, 1u, 2u, 3u) * bpb + XeTextureGuestLinearOffset( - block_index_guest, xe_texture_copy_size.y, + block_index_guest, xe_texture_copy_size_blocks.y, xe_texture_copy_guest_pitch, 16u); } return block_offsets_guest + xe_texture_copy_guest_base; diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_128bpb.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_128bpb.cs.hlsl index c574dc446..793f90071 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_128bpb.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_128bpb.cs.hlsl @@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { // 1 thread = 4 uint4 blocks. 
uint3 block_index = xe_thread_id; block_index.x <<= 2u; - [branch] if (any(block_index >= xe_texture_copy_size)) { + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { return; } uint4 block_offsets_guest = @@ -19,8 +19,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { block_2 = XeByteSwap(block_2, xe_texture_copy_endianness); block_3 = XeByteSwap(block_3, xe_texture_copy_endianness); uint block_offset_host = XeTextureHostLinearOffset( - block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 16u) + - xe_texture_copy_host_base; + block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch, + 16u) + xe_texture_copy_host_base; uint4 block_offsets_host = uint4(0u, 16u, 32u, 48u) + block_offset_host; xe_texture_copy_dest.Store4(block_offsets_host.x, block_0); xe_texture_copy_dest.Store4(block_offsets_host.y, block_1); diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_16bpb.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_16bpb.cs.hlsl index 291d10227..da8be7c4b 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_16bpb.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_16bpb.cs.hlsl @@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { // 1 thread = 4 ushort blocks. 
uint3 block_index = xe_thread_id; block_index.x <<= 2u; - [branch] if (any(block_index >= xe_texture_copy_size)) { + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { return; } uint4 block_offsets_guest = @@ -18,8 +18,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { blocks = (blocks >> ((block_offsets_guest & 2u) << 3u)) & 0xFFFFu; blocks = XeByteSwap16(blocks, xe_texture_copy_endianness); uint block_offset_host = XeTextureHostLinearOffset( - block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 2u) + - xe_texture_copy_host_base; + block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch, + 2u) + xe_texture_copy_host_base; xe_texture_copy_dest.Store2(block_offset_host, blocks.xz | (blocks.yw << 16u)); } diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_32bpb.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_32bpb.cs.hlsl index bfef0d9c3..cfc39b1b3 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_32bpb.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_32bpb.cs.hlsl @@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { // 1 thread = 4 uint blocks. 
uint3 block_index = xe_thread_id; block_index.x <<= 2u; - [branch] if (any(block_index >= xe_texture_copy_size)) { + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { return; } uint4 block_offsets_guest = @@ -16,7 +16,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { xe_texture_copy_source.Load(block_offsets_guest.w)); blocks = XeByteSwap(blocks, xe_texture_copy_endianness); uint block_offset_host = XeTextureHostLinearOffset( - block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 4u) + - xe_texture_copy_host_base; + block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch, + 4u) + xe_texture_copy_host_base; xe_texture_copy_dest.Store4(block_offset_host, blocks); } diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_64bpb.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_64bpb.cs.hlsl index f886237aa..a29e9430a 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_64bpb.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_64bpb.cs.hlsl @@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { // 1 thread = 4 uint2 blocks. 
uint3 block_index = xe_thread_id; block_index.x <<= 2u; - [branch] if (any(block_index >= xe_texture_copy_size)) { + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { return; } uint4 block_offsets_guest = @@ -17,8 +17,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness); blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness); uint block_offset_host = XeTextureHostLinearOffset( - block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 8u) + - xe_texture_copy_host_base; + block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch, + 8u) + xe_texture_copy_host_base; xe_texture_copy_dest.Store4(block_offset_host, blocks_01); xe_texture_copy_dest.Store4(block_offset_host + 16u, blocks_23); } diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_8bpb.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_8bpb.cs.hlsl index 59e5357e2..d4386b8ab 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_8bpb.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_8bpb.cs.hlsl @@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { // 1 thread = 4 ubyte blocks. 
uint3 block_index = xe_thread_id; block_index.x <<= 2u; - [branch] if (any(block_index >= xe_texture_copy_size)) { + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { return; } uint4 block_offsets_guest = @@ -20,7 +20,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { blocks.xy |= blocks.zw; blocks.x |= blocks.y; uint block_offset_host = XeTextureHostLinearOffset( - block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 1u) + - xe_texture_copy_host_base; + block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch, + 1u) + xe_texture_copy_host_base; xe_texture_copy_dest.Store(block_offset_host, blocks.x); } diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl new file mode 100644 index 000000000..12101ad83 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl @@ -0,0 +1,101 @@ +#include "texture_copy.hlsli" + +// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf +// CTX1 is like DXT3/5 color, but 2-component and with 8:8 endpoints rather than +// 5:6:5.
+// +// Dword 1: +// rrrrrrrr gggggggg +// RRRRRRRR GGGGGGGG +// Dword 2: +// AA BB CC DD +// EE FF GG HH +// II JJ KK LL +// MM NN OO PP + +void XeCTX1FourBlocksRowToR8G8(uint4 weights_high, uint weights_shift, + uint4 end_low_rr00gg00, uint4 end_high_rr00gg00, + out uint4 row_01, out uint4 row_23) { + uint4 weights_low = ~weights_high; + uint4 weights_shifts = uint4(0u, 2u, 4u, 6u) + weights_shift; + uint4 row_3aaaa = + ((weights_low >> weights_shifts.x) & 3u) * end_low_rr00gg00 + + ((weights_high >> weights_shifts.x) & 3u) * end_high_rr00gg00; + uint4 row_3bbbb = + ((weights_low >> weights_shifts.y) & 3u) * end_low_rr00gg00 + + ((weights_high >> weights_shifts.y) & 3u) * end_high_rr00gg00; + uint4 row_3cccc = + ((weights_low >> weights_shifts.z) & 3u) * end_low_rr00gg00 + + ((weights_high >> weights_shifts.z) & 3u) * end_high_rr00gg00; + uint4 row_3dddd = + ((weights_low >> weights_shifts.w) & 3u) * end_low_rr00gg00 + + ((weights_high >> weights_shifts.w) & 3u) * end_high_rr00gg00; + uint4 row_half_3acac = uint4(row_3aaaa.xy, row_3cccc.xy).xzyw; + uint4 row_half_3bdbd = uint4(row_3bbbb.xy, row_3dddd.xy).xzyw; + // R0A G0A R0B G0B | R0C G0C R0D G0D | R1A G1A R1B G1B | R1C G1C R1D G1D + row_01 = ((row_half_3acac & 0xFFFFu) / 3u) | + (((row_half_3acac >> 16u) / 3u) << 8u) | + (((row_half_3bdbd & 0xFFFFu) / 3u) << 16u) | + (((row_half_3bdbd >> 16u) / 3u) << 24u); + row_half_3acac = uint4(row_3aaaa.zw, row_3cccc.zw).xzyw; + row_half_3bdbd = uint4(row_3bbbb.zw, row_3dddd.zw).xzyw; + // R2A G2A R2B G2B | R2C G2C R2D G2D | R3A G3A R3B G3B | R3C G3C R3D G3D + row_23 = ((row_half_3acac & 0xFFFFu) / 3u) | + (((row_half_3acac >> 16u) / 3u) << 8u) | + (((row_half_3bdbd & 0xFFFFu) / 3u) << 16u) | + (((row_half_3bdbd >> 16u) / 3u) << 24u); +} + +[numthreads(8, 32, 1)] +void main(uint3 xe_thread_id : SV_DispatchThreadID) { + // 1 thread = 4 CTX1 (8bpb) blocks to 16x4 R8G8 texels. 
+ uint3 block_index = xe_thread_id; + block_index.x <<= 2u; + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { + return; + } + uint4 block_offsets_guest = + XeTextureCopyGuestBlockOffsets(block_index, 8u, 3u); + uint4 blocks_01 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.x), + xe_texture_copy_source.Load2(block_offsets_guest.y)); + uint4 blocks_23 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.z), + xe_texture_copy_source.Load2(block_offsets_guest.w)); + blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness); + blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness); + + // Sort the color indices so they can be used as weights for the second + // endpoint. Initially 00 = 3:0, 01 = 0:3, 10 = 2:1, 11 = 1:2. + uint4 weights_high = uint4(blocks_01.yw, blocks_23.yw); + // Swap bits. 00 = 3:0, 01 = 2:1, 10 = 0:3, 11 = 1:2. + weights_high = ((weights_high & 0x55555555u) << 1u) | + ((weights_high & 0xAAAAAAAAu) >> 1u); + // Swap 10 and 11. 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3. + weights_high ^= ((weights_high & 0xAAAAAAAAu) >> 1u); + + // Unpack the endpoints as: + // 0x00g000r0 0x00g100r1 0x00g200r2 0x00g300r3 + // 0x00G000R0 0x00G100R1 0x00G200R2 0x00G300R3 + // so they can be multiplied by their weights allowing overflow. + uint4 end_packed = uint4(blocks_01.xz, blocks_23.xz); + uint4 end_low_rr00gg00 = + (end_packed & 0xFFu) | ((end_packed & 0xFF00u) << 8u); + uint4 end_high_rr00gg00 = + ((end_packed & 0xFF0000u) >> 16u) | ((end_packed & 0xFF000000u) >> 8u); + + // Uncompress and write the rows. 
+ uint3 texel_index_host = block_index << uint3(2u, 2u, 0u); + uint texel_offset_host = XeTextureHostLinearOffset( + texel_index_host, xe_texture_copy_size_texels.y, + xe_texture_copy_host_pitch, 2u) + xe_texture_copy_host_base; + for (uint i = 0u; i < 4u; ++i) { + uint4 row_01, row_23; + XeCTX1FourBlocksRowToR8G8(weights_high, i * 8u, end_low_rr00gg00, + end_high_rr00gg00, row_01, row_23); + xe_texture_copy_dest.Store4(texel_offset_host, row_01); + xe_texture_copy_dest.Store4(texel_offset_host + 16u, row_23); + if (++texel_index_host.y >= xe_texture_copy_size_texels.y) { + return; + } + texel_offset_host += xe_texture_copy_host_pitch; + } +} diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 20e439a58..307f0e519 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -29,6 +29,7 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/bin/texture_load_32bpb_cs.h" #include "xenia/gpu/d3d12/shaders/bin/texture_load_64bpb_cs.h" #include "xenia/gpu/d3d12/shaders/bin/texture_load_8bpb_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/texture_load_ctx1_cs.h" const TextureCache::HostFormat TextureCache::host_formats_[64] = { {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_1_REVERSE @@ -92,7 +93,7 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = { {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_32_32_32_FLOAT {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_DXT3A {DXGI_FORMAT_BC4_UNORM, CopyMode::k64bpb}, // k_DXT5A - {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_CTX1 + {DXGI_FORMAT_R8G8_UNORM, CopyMode::kCTX1}, // k_CTX1 {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_DXT3A_AS_1_1_1_1 {DXGI_FORMAT_R8G8B8A8_UNORM, CopyMode::k32bpb}, // k_8_8_8_8_GAMMA {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_2_10_10_10_FLOAT_EDRAM @@ -107,6 +108,7 @@ const TextureCache::CopyModeInfo TextureCache::copy_mode_info_[] = { {texture_load_32bpb_cs, sizeof(texture_load_32bpb_cs)}, 
{texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)}, {texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)}, + {texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)}, }; TextureCache::TextureCache(D3D12CommandProcessor* command_processor, @@ -820,11 +822,14 @@ bool TextureCache::LoadTextureData(Texture* texture) { : texture->mip_pitches[j]; copy_constants.host_base = uint32_t(host_layouts[j].Offset); copy_constants.host_pitch = host_layouts[j].Footprint.RowPitch; - copy_constants.size[0] = - (std::max(width >> j, 1u) + (block_width - 1)) / block_width; - copy_constants.size[1] = - (std::max(height >> j, 1u) + (block_height - 1)) / block_height; - copy_constants.size[2] = std::max(depth >> j, 1u); + copy_constants.size_texels[0] = std::max(width >> j, 1u); + copy_constants.size_texels[1] = std::max(height >> j, 1u); + copy_constants.size_texels[2] = std::max(depth >> j, 1u); + copy_constants.size_blocks[0] = + (copy_constants.size_texels[0] + (block_width - 1)) / block_width; + copy_constants.size_blocks[1] = + (copy_constants.size_texels[1] + (block_height - 1)) / block_height; + copy_constants.size_blocks[2] = copy_constants.size_texels[2]; if (texture->key.packed_mips) { texture_util::GetPackedMipOffset(width, height, depth, guest_format, j, copy_constants.guest_mip_offset[0], @@ -843,9 +848,9 @@ bool TextureCache::LoadTextureData(Texture* texture) { std::memcpy(cbuffer_mapping, &copy_constants, sizeof(copy_constants)); command_list->SetComputeRootConstantBufferView(0, cbuffer_gpu_address); // Each thread group processes 32x32x1 blocks.
- command_list->Dispatch((copy_constants.size[0] + 31) >> 5, - (copy_constants.size[1] + 31) >> 5, - copy_constants.size[2]); + command_list->Dispatch((copy_constants.size_blocks[0] + 31) >> 5, + (copy_constants.size_blocks[1] + 31) >> 5, + copy_constants.size_blocks[2]); } barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; barriers[0].UAV.pResource = copy_buffer; diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 83c81c9dd..f43e6806b 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -90,6 +90,7 @@ class TextureCache { k32bpb, k64bpb, k128bpb, + kCTX1, kCount, @@ -194,14 +195,16 @@ class TextureCache { uint32_t host_pitch; // vec4 1. - // Size in blocks. - uint32_t size[3]; + uint32_t size_texels[3]; uint32_t is_3d; // vec4 2. + uint32_t size_blocks[3]; + uint32_t endianness; + + // vec4 3. // Offset within the packed mip for small mips. uint32_t guest_mip_offset[3]; - uint32_t endianness; static constexpr uint32_t kGuestPitchTiled = UINT32_MAX; };