[D3D12] CTX1 texture loading shader
This commit is contained in:
parent
17fb60a97a
commit
428095f62a
|
@ -11,13 +11,14 @@ cbuffer xe_texture_copy_constants : register(b0) {
|
|||
uint xe_texture_copy_host_base;
|
||||
uint xe_texture_copy_host_pitch;
|
||||
|
||||
// Size in blocks.
|
||||
uint3 xe_texture_copy_size;
|
||||
uint3 xe_texture_copy_size_texels;
|
||||
bool xe_texture_copy_is_3d;
|
||||
|
||||
uint3 xe_texture_copy_size_blocks;
|
||||
uint xe_texture_copy_endianness;
|
||||
|
||||
// Offset within the packed mip for small mips.
|
||||
uint3 xe_texture_copy_guest_mip_offset;
|
||||
uint xe_texture_copy_endianness;
|
||||
};
|
||||
|
||||
#define XeTextureCopyGuestPitchTiled 0xFFFFFFFFu
|
||||
|
@ -33,15 +34,15 @@ uint4 XeTextureCopyGuestBlockOffsets(uint3 block_index, uint bpb,
|
|||
[branch] if (xe_texture_copy_guest_pitch == XeTextureCopyGuestPitchTiled) {
|
||||
[branch] if (xe_texture_copy_is_3d) {
|
||||
block_offsets_guest = XeTextureTiledOffset3D(
|
||||
block_index_guest, xe_texture_copy_size.xy, bpb_log2);
|
||||
block_index_guest, xe_texture_copy_size_blocks.xy, bpb_log2);
|
||||
} else {
|
||||
block_offsets_guest = XeTextureTiledOffset2D(
|
||||
block_index_guest.xy, xe_texture_copy_size.x, bpb_log2);
|
||||
block_index_guest.xy, xe_texture_copy_size_blocks.x, bpb_log2);
|
||||
}
|
||||
} else {
|
||||
block_offsets_guest =
|
||||
uint4(0u, 1u, 2u, 3u) * bpb + XeTextureGuestLinearOffset(
|
||||
block_index_guest, xe_texture_copy_size.y,
|
||||
block_index_guest, xe_texture_copy_size_blocks.y,
|
||||
xe_texture_copy_guest_pitch, 16u);
|
||||
}
|
||||
return block_offsets_guest + xe_texture_copy_guest_base;
|
||||
|
|
|
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
// 1 thread = 4 uint4 blocks.
|
||||
uint3 block_index = xe_thread_id;
|
||||
block_index.x <<= 2u;
|
||||
[branch] if (any(block_index >= xe_texture_copy_size)) {
|
||||
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
|
||||
return;
|
||||
}
|
||||
uint4 block_offsets_guest =
|
||||
|
@ -19,8 +19,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
block_2 = XeByteSwap(block_2, xe_texture_copy_endianness);
|
||||
block_3 = XeByteSwap(block_3, xe_texture_copy_endianness);
|
||||
uint block_offset_host = XeTextureHostLinearOffset(
|
||||
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 16u) +
|
||||
xe_texture_copy_host_base;
|
||||
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
|
||||
16u) + xe_texture_copy_host_base;
|
||||
uint4 block_offsets_host = uint4(0u, 16u, 32u, 48u) + block_offset_host;
|
||||
xe_texture_copy_dest.Store4(block_offsets_host.x, block_0);
|
||||
xe_texture_copy_dest.Store4(block_offsets_host.y, block_1);
|
||||
|
|
|
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
// 1 thread = 4 ushort blocks.
|
||||
uint3 block_index = xe_thread_id;
|
||||
block_index.x <<= 2u;
|
||||
[branch] if (any(block_index >= xe_texture_copy_size)) {
|
||||
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
|
||||
return;
|
||||
}
|
||||
uint4 block_offsets_guest =
|
||||
|
@ -18,8 +18,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
blocks = (blocks >> ((block_offsets_guest & 2u) << 3u)) & 0xFFFFu;
|
||||
blocks = XeByteSwap16(blocks, xe_texture_copy_endianness);
|
||||
uint block_offset_host = XeTextureHostLinearOffset(
|
||||
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 2u) +
|
||||
xe_texture_copy_host_base;
|
||||
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
|
||||
2u) + xe_texture_copy_host_base;
|
||||
xe_texture_copy_dest.Store2(block_offset_host,
|
||||
blocks.xz | (blocks.yw << 16u));
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
// 1 thread = 4 uint blocks.
|
||||
uint3 block_index = xe_thread_id;
|
||||
block_index.x <<= 2u;
|
||||
[branch] if (any(block_index >= xe_texture_copy_size)) {
|
||||
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
|
||||
return;
|
||||
}
|
||||
uint4 block_offsets_guest =
|
||||
|
@ -16,7 +16,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
xe_texture_copy_source.Load(block_offsets_guest.w));
|
||||
blocks = XeByteSwap(blocks, xe_texture_copy_endianness);
|
||||
uint block_offset_host = XeTextureHostLinearOffset(
|
||||
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 4u) +
|
||||
xe_texture_copy_host_base;
|
||||
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
|
||||
4u) + xe_texture_copy_host_base;
|
||||
xe_texture_copy_dest.Store4(block_offset_host, blocks);
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
// 1 thread = 4 uint2 blocks.
|
||||
uint3 block_index = xe_thread_id;
|
||||
block_index.x <<= 2u;
|
||||
[branch] if (any(block_index >= xe_texture_copy_size)) {
|
||||
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
|
||||
return;
|
||||
}
|
||||
uint4 block_offsets_guest =
|
||||
|
@ -17,8 +17,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
|
||||
blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);
|
||||
uint block_offset_host = XeTextureHostLinearOffset(
|
||||
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 8u) +
|
||||
xe_texture_copy_host_base;
|
||||
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
|
||||
8u) + xe_texture_copy_host_base;
|
||||
xe_texture_copy_dest.Store4(block_offset_host, blocks_01);
|
||||
xe_texture_copy_dest.Store4(block_offset_host + 16u, blocks_23);
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
// 1 thread = 4 ubyte blocks.
|
||||
uint3 block_index = xe_thread_id;
|
||||
block_index.x <<= 2u;
|
||||
[branch] if (any(block_index >= xe_texture_copy_size)) {
|
||||
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
|
||||
return;
|
||||
}
|
||||
uint4 block_offsets_guest =
|
||||
|
@ -20,7 +20,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
|||
blocks.xy |= blocks.zw;
|
||||
blocks.x |= blocks.y;
|
||||
uint block_offset_host = XeTextureHostLinearOffset(
|
||||
block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 1u) +
|
||||
xe_texture_copy_host_base;
|
||||
block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
|
||||
1u) + xe_texture_copy_host_base;
|
||||
xe_texture_copy_dest.Store(block_offset_host, blocks.x);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
#include "texture_copy.hlsli"
|
||||
|
||||
// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
|
||||
// CTX1 is like DXT3/5 color, but 2-component and with 8:8 endpoints rather than
|
||||
// 5:6:5.
|
||||
//
|
||||
// Dword 1:
|
||||
// rrrrrrrr gggggggg
|
||||
// RRRRRRRR GGGGGGGG
|
||||
// Dword 2:
|
||||
// AA BB CC DD
|
||||
// EE FF GG HH
|
||||
// II JJ KK LL
|
||||
// MM NN OO PP
|
||||
|
||||
// Decodes one 4-texel row of four CTX1 blocks into packed R8G8 texels.
//
// Parameters:
//   weights_high      - one dword per block; every 2 bits are the weight
//                       (0..3) applied to the high endpoint for one texel.
//   weights_shift     - bit offset of the first texel of the wanted row inside
//                       each weights dword (the caller passes row * 8).
//   end_low_rr00gg00  - per-block low endpoint with red in bits 0-7 and green
//                       in bits 16-23.
//   end_high_rr00gg00 - per-block high endpoint in the same bit layout.
//   row_01            - out: the row for block 0 (.xy) and block 1 (.zw),
//                       two R8G8 texels per dword.
//   row_23            - out: the same for block 2 (.xy) and block 3 (.zw).
void XeCTX1FourBlocksRowToR8G8(uint4 weights_high, uint weights_shift,
|
||||
uint4 end_low_rr00gg00, uint4 end_high_rr00gg00,
|
||||
out uint4 row_01, out uint4 row_23) {
|
||||
// For a 2-bit weight w, (~w & 3) == 3 - w, so after masking the low and high
// weights of every texel always sum to 3.
uint4 weights_low = ~weights_high;
|
||||
// Bit positions of the four texels (A, B, C, D) of the requested row.
uint4 weights_shifts = uint4(0u, 2u, 4u, 6u) + weights_shift;
|
||||
// Each row_3xxxx holds, for one texel column and all four blocks (one block
// per component), 3x the interpolated color. Every 16-bit half is at most
// 3 * 255, so the red and green sums cannot carry into each other.
uint4 row_3aaaa =
|
||||
((weights_low >> weights_shifts.x) & 3u) * end_low_rr00gg00 +
|
||||
((weights_high >> weights_shifts.x) & 3u) * end_high_rr00gg00;
|
||||
uint4 row_3bbbb =
|
||||
((weights_low >> weights_shifts.y) & 3u) * end_low_rr00gg00 +
|
||||
((weights_high >> weights_shifts.y) & 3u) * end_high_rr00gg00;
|
||||
uint4 row_3cccc =
|
||||
((weights_low >> weights_shifts.z) & 3u) * end_low_rr00gg00 +
|
||||
((weights_high >> weights_shifts.z) & 3u) * end_high_rr00gg00;
|
||||
uint4 row_3dddd =
|
||||
((weights_low >> weights_shifts.w) & 3u) * end_low_rr00gg00 +
|
||||
((weights_high >> weights_shifts.w) & 3u) * end_high_rr00gg00;
|
||||
// Regroup blocks 0 and 1 as (A0, C0, A1, C1) and (B0, D0, B1, D1) so that one
// A/C value and one B/D value can be merged into each output dword.
uint4 row_half_3acac = uint4(row_3aaaa.xy, row_3cccc.xy).xzyw;
|
||||
uint4 row_half_3bdbd = uint4(row_3bbbb.xy, row_3dddd.xy).xzyw;
|
||||
// R0A G0A R0B G0B | R0C G0C R0D G0D | R1A G1A R1B G1B | R1C G1C R1D G1D
|
||||
// Divide each 16-bit half by 3 to get the 8-bit channel, then place red and
// green of the A/C texel in bytes 0-1 and of the B/D texel in bytes 2-3.
row_01 = ((row_half_3acac & 0xFFFFu) / 3u) |
|
||||
(((row_half_3acac >> 16u) / 3u) << 8u) |
|
||||
(((row_half_3bdbd & 0xFFFFu) / 3u) << 16u) |
|
||||
(((row_half_3bdbd >> 16u) / 3u) << 24u);
|
||||
// Same regrouping and packing for blocks 2 and 3.
row_half_3acac = uint4(row_3aaaa.zw, row_3cccc.zw).xzyw;
|
||||
row_half_3bdbd = uint4(row_3bbbb.zw, row_3dddd.zw).xzyw;
|
||||
// R2A G2A R2B G2B | R2C G2C R2D G2D | R3A G3A R3B G3B | R3C G3C R3D G3D
|
||||
row_23 = ((row_half_3acac & 0xFFFFu) / 3u) |
|
||||
(((row_half_3acac >> 16u) / 3u) << 8u) |
|
||||
(((row_half_3bdbd & 0xFFFFu) / 3u) << 16u) |
|
||||
(((row_half_3bdbd >> 16u) / 3u) << 24u);
|
||||
}
|
||||
|
||||
// Compute shader entry point that expands CTX1-compressed blocks from the
// source buffer into uncompressed R8G8 texels in the destination buffer.
// A group has 8x32 threads and each thread handles 4 horizontally adjacent
// blocks, so one group covers 32x32 blocks.
[numthreads(8, 32, 1)]
|
||||
void main(uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
// 1 thread = 4 CTX1 (8bpb) blocks to 16x4 R8G8 texels.
|
||||
uint3 block_index = xe_thread_id;
|
||||
// Index of the first of this thread's four blocks.
block_index.x <<= 2u;
|
||||
// Skip threads whose first block lies outside the texture (size in blocks).
[branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
|
||||
return;
|
||||
}
|
||||
// Source offsets of the four blocks; 8u is the block size in bytes and 3u its
// log2.
uint4 block_offsets_guest =
|
||||
XeTextureCopyGuestBlockOffsets(block_index, 8u, 3u);
|
||||
// Two dwords per block: .x/.z receive the endpoints dword and .y/.w the
// weights dword of consecutive blocks.
uint4 blocks_01 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.x),
|
||||
xe_texture_copy_source.Load2(block_offsets_guest.y));
|
||||
uint4 blocks_23 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.z),
|
||||
xe_texture_copy_source.Load2(block_offsets_guest.w));
|
||||
// NOTE(review): presumably converts the dwords from guest byte order as
// selected by xe_texture_copy_endianness - XeByteSwap is defined elsewhere.
blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
|
||||
blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);
|
||||
|
||||
// Sort the color indices so they can be used as weights for the second
|
||||
// endpoint. Initially 00 = 3:0, 01 = 0:3, 10 = 2:1, 11 = 1:2.
|
||||
uint4 weights_high = uint4(blocks_01.yw, blocks_23.yw);
|
||||
// Swap bits. 00 = 3:0, 01 = 2:1, 10 = 0:3, 11 = 1:2.
|
||||
weights_high = ((weights_high & 0x55555555u) << 1u) |
|
||||
((weights_high & 0xAAAAAAAAu) >> 1u);
|
||||
// Swap 10 and 11. 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3.
|
||||
weights_high ^= ((weights_high & 0xAAAAAAAAu) >> 1u);
|
||||
|
||||
// Unpack the endpoints as:
|
||||
// 0x00g000r0 0x00g100r1 0x00g200r2 0x00g300r3
|
||||
// 0x00G000R0 0x00G100R1 0x00G200R2 0x00G300R3
|
||||
// so they can be multiplied by their weights allowing overflow.
|
||||
uint4 end_packed = uint4(blocks_01.xz, blocks_23.xz);
|
||||
// Low endpoint: byte 0 stays in bits 0-7, byte 1 moves to bits 16-23.
uint4 end_low_rr00gg00 =
|
||||
(end_packed & 0xFFu) | ((end_packed & 0xFF00u) << 8u);
|
||||
// High endpoint: byte 2 moves to bits 0-7, byte 3 moves to bits 16-23.
uint4 end_high_rr00gg00 =
|
||||
((end_packed & 0xFF0000u) >> 16u) | ((end_packed & 0xFF000000u) >> 8u);
|
||||
|
||||
// Uncompress and write the rows.
|
||||
// Blocks are 4x4 texels, so X and Y block indices become texel indices via a
// shift by 2; Z is left unchanged.
uint3 texel_index_host = block_index << uint3(2u, 2u, 0u);
|
||||
// Destination offset of the first texel of the first row, using the height
// in texels and 2 bytes per R8G8 texel.
uint texel_offset_host = XeTextureHostLinearOffset(
|
||||
texel_index_host, xe_texture_copy_size_texels.y,
|
||||
xe_texture_copy_host_pitch, 2u) + xe_texture_copy_host_base;
|
||||
// One iteration per texel row of the blocks; each row's weights occupy 8 bits.
for (uint i = 0u; i < 4u; ++i) {
|
||||
uint4 row_01, row_23;
|
||||
XeCTX1FourBlocksRowToR8G8(weights_high, i * 8u, end_low_rr00gg00,
|
||||
end_high_rr00gg00, row_01, row_23);
|
||||
// 16 texels * 2 bytes = 32 bytes, written as two 16-byte stores.
xe_texture_copy_dest.Store4(texel_offset_host, row_01);
|
||||
xe_texture_copy_dest.Store4(texel_offset_host + 16u, row_23);
|
||||
// Stop once the next row would be below the texture's last texel row (the
// height in texels need not be a multiple of 4).
if (++texel_index_host.y >= xe_texture_copy_size_texels.y) {
|
||||
return;
|
||||
}
|
||||
// NOTE(review): assumes xe_texture_copy_host_pitch is the byte distance
// between consecutive texel rows - confirm against the host-side setup.
texel_offset_host += xe_texture_copy_host_pitch;
|
||||
}
|
||||
}
|
|
@ -29,6 +29,7 @@ namespace d3d12 {
|
|||
#include "xenia/gpu/d3d12/shaders/bin/texture_load_32bpb_cs.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/texture_load_64bpb_cs.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/texture_load_8bpb_cs.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/texture_load_ctx1_cs.h"
|
||||
|
||||
const TextureCache::HostFormat TextureCache::host_formats_[64] = {
|
||||
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_1_REVERSE
|
||||
|
@ -92,7 +93,7 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = {
|
|||
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_32_32_32_FLOAT
|
||||
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_DXT3A
|
||||
{DXGI_FORMAT_BC4_UNORM, CopyMode::k64bpb}, // k_DXT5A
|
||||
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_CTX1
|
||||
{DXGI_FORMAT_R8G8_UNORM, CopyMode::kCTX1}, // k_CTX1
|
||||
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_DXT3A_AS_1_1_1_1
|
||||
{DXGI_FORMAT_R8G8B8A8_UNORM, CopyMode::k32bpb}, // k_8_8_8_8_GAMMA
|
||||
{DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_2_10_10_10_FLOAT_EDRAM
|
||||
|
@ -107,6 +108,7 @@ const TextureCache::CopyModeInfo TextureCache::copy_mode_info_[] = {
|
|||
{texture_load_32bpb_cs, sizeof(texture_load_32bpb_cs)},
|
||||
{texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)},
|
||||
{texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)},
|
||||
{texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)},
|
||||
};
|
||||
|
||||
TextureCache::TextureCache(D3D12CommandProcessor* command_processor,
|
||||
|
@ -820,11 +822,14 @@ bool TextureCache::LoadTextureData(Texture* texture) {
|
|||
: texture->mip_pitches[j];
|
||||
copy_constants.host_base = uint32_t(host_layouts[j].Offset);
|
||||
copy_constants.host_pitch = host_layouts[j].Footprint.RowPitch;
|
||||
copy_constants.size[0] =
|
||||
(std::max(width >> j, 1u) + (block_width - 1)) / block_width;
|
||||
copy_constants.size[1] =
|
||||
(std::max(height >> j, 1u) + (block_height - 1)) / block_height;
|
||||
copy_constants.size[2] = std::max(depth >> j, 1u);
|
||||
copy_constants.size_texels[0] = std::max(width >> j, 1u);
|
||||
copy_constants.size_texels[1] = std::max(height >> j, 1u);
|
||||
copy_constants.size_texels[2] = std::max(depth >> j, 1u);
|
||||
copy_constants.size_blocks[0] =
|
||||
(copy_constants.size_texels[0] + (block_width - 1)) / block_width;
|
||||
copy_constants.size_blocks[1] =
|
||||
(copy_constants.size_texels[1] + (block_height - 1)) / block_height;
|
||||
copy_constants.size_blocks[2] = copy_constants.size_texels[2];
|
||||
if (texture->key.packed_mips) {
|
||||
texture_util::GetPackedMipOffset(width, height, depth, guest_format, j,
|
||||
copy_constants.guest_mip_offset[0],
|
||||
|
@ -843,9 +848,9 @@ bool TextureCache::LoadTextureData(Texture* texture) {
|
|||
std::memcpy(cbuffer_mapping, &copy_constants, sizeof(copy_constants));
|
||||
command_list->SetComputeRootConstantBufferView(0, cbuffer_gpu_address);
|
||||
// Each thread group processes 32x32x1 blocks.
|
||||
command_list->Dispatch((copy_constants.size[0] + 31) >> 5,
|
||||
(copy_constants.size[1] + 31) >> 5,
|
||||
copy_constants.size[2]);
|
||||
command_list->Dispatch((copy_constants.size_blocks[0] + 31) >> 5,
|
||||
(copy_constants.size_blocks[1] + 31) >> 5,
|
||||
copy_constants.size_blocks[2]);
|
||||
}
|
||||
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
|
||||
barriers[0].UAV.pResource = copy_buffer;
|
||||
|
|
|
@ -90,6 +90,7 @@ class TextureCache {
|
|||
k32bpb,
|
||||
k64bpb,
|
||||
k128bpb,
|
||||
kCTX1,
|
||||
|
||||
kCount,
|
||||
|
||||
|
@ -194,14 +195,16 @@ class TextureCache {
|
|||
uint32_t host_pitch;
|
||||
|
||||
// vec4 1.
|
||||
// Size in blocks.
|
||||
uint32_t size[3];
|
||||
uint32_t size_texels[3];
|
||||
uint32_t is_3d;
|
||||
|
||||
// vec4 2.
|
||||
uint32_t size_blocks[3];
|
||||
uint32_t endianness;
|
||||
|
||||
// vec4 3.
|
||||
// Offset within the packed mip for small mips.
|
||||
uint32_t guest_mip_offset[3];
|
||||
uint32_t endianness;
|
||||
|
||||
static constexpr uint32_t kGuestPitchTiled = UINT32_MAX;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue