[D3D12] CTX1 texture loading shader

This commit is contained in:
Triang3l 2018-08-15 16:27:13 +03:00
parent 17fb60a97a
commit 428095f62a
9 changed files with 143 additions and 33 deletions

View File

@ -11,13 +11,14 @@ cbuffer xe_texture_copy_constants : register(b0) {
uint xe_texture_copy_host_base;
uint xe_texture_copy_host_pitch;
// Size in blocks.
uint3 xe_texture_copy_size;
uint3 xe_texture_copy_size_texels;
bool xe_texture_copy_is_3d;
uint3 xe_texture_copy_size_blocks;
uint xe_texture_copy_endianness;
// Offset within the packed mip for small mips.
uint3 xe_texture_copy_guest_mip_offset;
uint xe_texture_copy_endianness;
};
#define XeTextureCopyGuestPitchTiled 0xFFFFFFFFu
@ -33,15 +34,15 @@ uint4 XeTextureCopyGuestBlockOffsets(uint3 block_index, uint bpb,
[branch] if (xe_texture_copy_guest_pitch == XeTextureCopyGuestPitchTiled) {
[branch] if (xe_texture_copy_is_3d) {
block_offsets_guest = XeTextureTiledOffset3D(
block_index_guest, xe_texture_copy_size.xy, bpb_log2);
block_index_guest, xe_texture_copy_size_blocks.xy, bpb_log2);
} else {
block_offsets_guest = XeTextureTiledOffset2D(
block_index_guest.xy, xe_texture_copy_size.x, bpb_log2);
block_index_guest.xy, xe_texture_copy_size_blocks.x, bpb_log2);
}
} else {
block_offsets_guest =
uint4(0u, 1u, 2u, 3u) * bpb + XeTextureGuestLinearOffset(
block_index_guest, xe_texture_copy_size.y,
block_index_guest, xe_texture_copy_size_blocks.y,
xe_texture_copy_guest_pitch, 16u);
}
return block_offsets_guest + xe_texture_copy_guest_base;

View File

@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
// 1 thread = 4 uint4 blocks.
uint3 block_index = xe_thread_id;
block_index.x <<= 2u;
[branch] if (any(block_index >= xe_texture_copy_size)) {
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
return;
}
uint4 block_offsets_guest =
@ -19,8 +19,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
block_2 = XeByteSwap(block_2, xe_texture_copy_endianness);
block_3 = XeByteSwap(block_3, xe_texture_copy_endianness);
uint block_offset_host = XeTextureHostLinearOffset(
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 16u) +
xe_texture_copy_host_base;
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
16u) + xe_texture_copy_host_base;
uint4 block_offsets_host = uint4(0u, 16u, 32u, 48u) + block_offset_host;
xe_texture_copy_dest.Store4(block_offsets_host.x, block_0);
xe_texture_copy_dest.Store4(block_offsets_host.y, block_1);

View File

@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
// 1 thread = 4 ushort blocks.
uint3 block_index = xe_thread_id;
block_index.x <<= 2u;
[branch] if (any(block_index >= xe_texture_copy_size)) {
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
return;
}
uint4 block_offsets_guest =
@ -18,8 +18,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
blocks = (blocks >> ((block_offsets_guest & 2u) << 3u)) & 0xFFFFu;
blocks = XeByteSwap16(blocks, xe_texture_copy_endianness);
uint block_offset_host = XeTextureHostLinearOffset(
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 2u) +
xe_texture_copy_host_base;
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
2u) + xe_texture_copy_host_base;
xe_texture_copy_dest.Store2(block_offset_host,
blocks.xz | (blocks.yw << 16u));
}

View File

@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
// 1 thread = 4 uint blocks.
uint3 block_index = xe_thread_id;
block_index.x <<= 2u;
[branch] if (any(block_index >= xe_texture_copy_size)) {
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
return;
}
uint4 block_offsets_guest =
@ -16,7 +16,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
xe_texture_copy_source.Load(block_offsets_guest.w));
blocks = XeByteSwap(blocks, xe_texture_copy_endianness);
uint block_offset_host = XeTextureHostLinearOffset(
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 4u) +
xe_texture_copy_host_base;
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
4u) + xe_texture_copy_host_base;
xe_texture_copy_dest.Store4(block_offset_host, blocks);
}

View File

@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
// 1 thread = 4 uint2 blocks.
uint3 block_index = xe_thread_id;
block_index.x <<= 2u;
[branch] if (any(block_index >= xe_texture_copy_size)) {
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
return;
}
uint4 block_offsets_guest =
@ -17,8 +17,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);
uint block_offset_host = XeTextureHostLinearOffset(
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 8u) +
xe_texture_copy_host_base;
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
8u) + xe_texture_copy_host_base;
xe_texture_copy_dest.Store4(block_offset_host, blocks_01);
xe_texture_copy_dest.Store4(block_offset_host + 16u, blocks_23);
}

View File

@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
// 1 thread = 4 ubyte blocks.
uint3 block_index = xe_thread_id;
block_index.x <<= 2u;
[branch] if (any(block_index >= xe_texture_copy_size)) {
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
return;
}
uint4 block_offsets_guest =
@ -20,7 +20,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
blocks.xy |= blocks.zw;
blocks.x |= blocks.y;
uint block_offset_host = XeTextureHostLinearOffset(
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 1u) +
xe_texture_copy_host_base;
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
1u) + xe_texture_copy_host_base;
xe_texture_copy_dest.Store(block_offset_host, blocks.x);
}

View File

@ -0,0 +1,101 @@
#include "texture_copy.hlsli"
// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
// CTX1 is like DXT3/5 color, but 2-component and with 8:8 endpoints rather than
// 5:6:5.
//
// Dword 1:
// rrrrrrrr gggggggg
// RRRRRRRR GGGGGGGG
// Dword 2:
// AA BB CC DD
// EE FF GG HH
// II JJ KK LL
// MM NN OO PP
// Blends the two endpoints of four CTX1 blocks (one block per component) for
// the texel whose 2-bit weight starts at bit `shift` of the weight dwords.
// weights_low must be the bitwise complement of weights_high, so every pair of
// 2-bit weights sums to 3. With endpoints laid out as 0x00gg00rr, the result
// carries 3 * red in the low 16 bits and 3 * green in the high 16 bits.
uint4 XeCTX1BlendTimes3(uint4 weights_low, uint4 weights_high, uint shift,
                        uint4 end_low_rr00gg00, uint4 end_high_rr00gg00) {
  uint4 low_part = ((weights_low >> shift) & 3u) * end_low_rr00gg00;
  uint4 high_part = ((weights_high >> shift) & 3u) * end_high_rr00gg00;
  return low_part + high_part;
}

// Divides the tripled channels of texels A, B, C, D of two blocks by 3 and
// packs them as consecutive R8G8 texels. Each argument holds the first block in
// .x and the second block in .y. Output (lowest byte first):
// R0A G0A R0B G0B | R0C G0C R0D G0D | R1A G1A R1B G1B | R1C G1C R1D G1D
uint4 XeCTX1PackTwoBlocksRow(uint2 a_x3, uint2 b_x3, uint2 c_x3, uint2 d_x3) {
  // Texels A and C land in the low halves of the output dwords, B and D in the
  // high halves.
  uint4 low_texels_x3 = uint4(a_x3.x, c_x3.x, a_x3.y, c_x3.y);
  uint4 high_texels_x3 = uint4(b_x3.x, d_x3.x, b_x3.y, d_x3.y);
  uint4 low_red = (low_texels_x3 & 0xFFFFu) / 3u;
  uint4 low_green = (low_texels_x3 >> 16u) / 3u;
  uint4 high_red = (high_texels_x3 & 0xFFFFu) / 3u;
  uint4 high_green = (high_texels_x3 >> 16u) / 3u;
  return low_red | (low_green << 8u) | (high_red << 16u) | (high_green << 24u);
}

// Decompresses one 4-texel-wide row of four CTX1 blocks to R8G8.
//
// weights_high      - per block, 2-bit weights of the second endpoint for all
//                     16 texels of the block.
// weights_shift     - bit position of the first texel of the row within the
//                     weights (the caller passes row index * 8).
// end_low_rr00gg00  - first endpoints, red in bits 0-7 and green in bits 16-23.
// end_high_rr00gg00 - second endpoints in the same layout.
// row_01            - receives the 8 texels of the row from blocks 0 and 1.
// row_23            - receives the 8 texels of the row from blocks 2 and 3.
void XeCTX1FourBlocksRowToR8G8(uint4 weights_high, uint weights_shift,
                               uint4 end_low_rr00gg00, uint4 end_high_rr00gg00,
                               out uint4 row_01, out uint4 row_23) {
  // The complement of a 2-bit weight w is 3 - w, the weight of the first
  // endpoint.
  uint4 weights_low = ~weights_high;

  // One blended value (still multiplied by 3) per block for each of the four
  // texels A-D of the row.
  uint4 texel_a_x3 =
      XeCTX1BlendTimes3(weights_low, weights_high, weights_shift + 0u,
                        end_low_rr00gg00, end_high_rr00gg00);
  uint4 texel_b_x3 =
      XeCTX1BlendTimes3(weights_low, weights_high, weights_shift + 2u,
                        end_low_rr00gg00, end_high_rr00gg00);
  uint4 texel_c_x3 =
      XeCTX1BlendTimes3(weights_low, weights_high, weights_shift + 4u,
                        end_low_rr00gg00, end_high_rr00gg00);
  uint4 texel_d_x3 =
      XeCTX1BlendTimes3(weights_low, weights_high, weights_shift + 6u,
                        end_low_rr00gg00, end_high_rr00gg00);

  // Blocks 0 and 1 live in .xy, blocks 2 and 3 in .zw.
  row_01 = XeCTX1PackTwoBlocksRow(texel_a_x3.xy, texel_b_x3.xy, texel_c_x3.xy,
                                  texel_d_x3.xy);
  row_23 = XeCTX1PackTwoBlocksRow(texel_a_x3.zw, texel_b_x3.zw, texel_c_x3.zw,
                                  texel_d_x3.zw);
}
[numthreads(8, 32, 1)]
void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  // Every thread converts a horizontal run of 4 CTX1 blocks (8 bytes each) to
  // 16x4 R8G8 texels.
  uint3 first_block = uint3(xe_thread_id.x << 2u, xe_thread_id.yz);
  [branch] if (any(first_block >= xe_texture_copy_size_blocks)) {
    return;
  }

  // Fetch the four blocks: endpoints in .x/.z, texel codes in .y/.w.
  uint4 guest_offsets = XeTextureCopyGuestBlockOffsets(first_block, 8u, 3u);
  uint4 blocks_01 = uint4(xe_texture_copy_source.Load2(guest_offsets.x),
                          xe_texture_copy_source.Load2(guest_offsets.y));
  uint4 blocks_23 = uint4(xe_texture_copy_source.Load2(guest_offsets.z),
                          xe_texture_copy_source.Load2(guest_offsets.w));
  blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
  blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);

  // Turn the 2-bit texel codes into weights of the second endpoint.
  // Stored codes: 00 = 3:0, 01 = 0:3, 10 = 2:1, 11 = 1:2 (first:second).
  uint4 codes = uint4(blocks_01.yw, blocks_23.yw);
  // Exchange the two bits of every code: 00 = 3:0, 01 = 2:1, 10 = 0:3,
  // 11 = 1:2.
  uint4 codes_swapped =
      ((codes << 1u) & 0xAAAAAAAAu) | ((codes >> 1u) & 0x55555555u);
  // Flip the low bit wherever the high bit is set, exchanging 10 and 11:
  // 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3.
  uint4 weights_high =
      codes_swapped ^ ((codes_swapped & 0xAAAAAAAAu) >> 1u);

  // Spread each endpoint's two 8-bit channels into separate 16-bit halves
  // (0x00gg00rr), leaving room for multiplication by weights of up to 3:
  // bytes 0 and 1 form the first endpoint, bytes 2 and 3 the second one.
  uint4 endpoints = uint4(blocks_01.xz, blocks_23.xz);
  uint4 end_low_rr00gg00 = (endpoints & 0xFFu) | ((endpoints << 8u) & 0xFF0000u);
  uint4 end_high_rr00gg00 =
      ((endpoints >> 16u) & 0xFFu) | ((endpoints >> 8u) & 0xFF0000u);

  // Host position of the top-left texel of the run: blocks are 4x4 texels, the
  // Z coordinate is not scaled. 2u is passed as the element size for the R8G8
  // destination.
  uint3 texel_origin = uint3(first_block.xy << 2u, first_block.z);
  uint row_offset_host =
      XeTextureHostLinearOffset(texel_origin, xe_texture_copy_size_texels.y,
                                xe_texture_copy_host_pitch, 2u) +
      xe_texture_copy_host_base;

  // Decompress and store up to four rows, stopping once the next row would be
  // at or past the texture height in texels.
  uint host_y = texel_origin.y;
  uint row = 0u;
  while (row < 4u) {
    uint4 texels_blocks_01, texels_blocks_23;
    // Each row occupies 8 bits (4 texels * 2 bits) of the weights.
    XeCTX1FourBlocksRowToR8G8(weights_high, row << 3u, end_low_rr00gg00,
                              end_high_rr00gg00, texels_blocks_01,
                              texels_blocks_23);
    xe_texture_copy_dest.Store4(row_offset_host, texels_blocks_01);
    xe_texture_copy_dest.Store4(row_offset_host + 16u, texels_blocks_23);
    ++host_y;
    if (host_y >= xe_texture_copy_size_texels.y) {
      break;
    }
    row_offset_host += xe_texture_copy_host_pitch;
    ++row;
  }
}

View File

@ -29,6 +29,7 @@ namespace d3d12 {
#include "xenia/gpu/d3d12/shaders/bin/texture_load_32bpb_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/texture_load_64bpb_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/texture_load_8bpb_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/texture_load_ctx1_cs.h"
const TextureCache::HostFormat TextureCache::host_formats_[64] = {
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_1_REVERSE
@ -92,7 +93,7 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = {
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_32_32_32_FLOAT
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_DXT3A
{DXGI_FORMAT_BC4_UNORM, CopyMode::k64bpb}, // k_DXT5A
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_CTX1
{DXGI_FORMAT_R8G8_UNORM, CopyMode::kCTX1}, // k_CTX1
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_DXT3A_AS_1_1_1_1
{DXGI_FORMAT_R8G8B8A8_UNORM, CopyMode::k32bpb}, // k_8_8_8_8_GAMMA
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_2_10_10_10_FLOAT_EDRAM
@ -107,6 +108,7 @@ const TextureCache::CopyModeInfo TextureCache::copy_mode_info_[] = {
{texture_load_32bpb_cs, sizeof(texture_load_32bpb_cs)},
{texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)},
{texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)},
{texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)},
};
TextureCache::TextureCache(D3D12CommandProcessor* command_processor,
@ -820,11 +822,14 @@ bool TextureCache::LoadTextureData(Texture* texture) {
: texture->mip_pitches[j];
copy_constants.host_base = uint32_t(host_layouts[j].Offset);
copy_constants.host_pitch = host_layouts[j].Footprint.RowPitch;
copy_constants.size[0] =
(std::max(width >> j, 1u) + (block_width - 1)) / block_width;
copy_constants.size[1] =
(std::max(height >> j, 1u) + (block_height - 1)) / block_height;
copy_constants.size[2] = std::max(depth >> j, 1u);
copy_constants.size_texels[0] = std::max(width >> j, 1u);
copy_constants.size_texels[1] = std::max(height >> j, 1u);
copy_constants.size_texels[2] = std::max(depth >> j, 1u);
copy_constants.size_blocks[0] =
(copy_constants.size_texels[0] + (block_width - 1)) / block_width;
copy_constants.size_blocks[1] =
(copy_constants.size_texels[1] + (block_height - 1)) / block_height;
copy_constants.size_blocks[2] = copy_constants.size_texels[2];
if (texture->key.packed_mips) {
texture_util::GetPackedMipOffset(width, height, depth, guest_format, j,
copy_constants.guest_mip_offset[0],
@ -843,9 +848,9 @@ bool TextureCache::LoadTextureData(Texture* texture) {
std::memcpy(cbuffer_mapping, &copy_constants, sizeof(copy_constants));
command_list->SetComputeRootConstantBufferView(0, cbuffer_gpu_address);
// Each thread group processes 32x32x1 blocks.
command_list->Dispatch((copy_constants.size[0] + 31) >> 5,
(copy_constants.size[1] + 31) >> 5,
copy_constants.size[2]);
command_list->Dispatch((copy_constants.size_blocks[0] + 31) >> 5,
(copy_constants.size_blocks[1] + 31) >> 5,
copy_constants.size_blocks[2]);
}
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
barriers[0].UAV.pResource = copy_buffer;

View File

@ -90,6 +90,7 @@ class TextureCache {
k32bpb,
k64bpb,
k128bpb,
kCTX1,
kCount,
@ -194,14 +195,16 @@ class TextureCache {
uint32_t host_pitch;
// vec4 1.
// Size in blocks.
uint32_t size[3];
uint32_t size_texels[3];
uint32_t is_3d;
// vec4 2.
uint32_t size_blocks[3];
uint32_t endianness;
// vec4 3.
// Offset within the packed mip for small mips.
uint32_t guest_mip_offset[3];
uint32_t endianness;
static constexpr uint32_t kGuestPitchTiled = UINT32_MAX;
};