[D3D12] CTX1 texture loading shader

2018-08-15 16:27:13 +03:00 · 2018-08-15 16:27:13 +03:00 · 428095f62a
parent 17fb60a97a
commit 428095f62a
9 changed files with 143 additions and 33 deletions
--- a/src/xenia/gpu/d3d12/shaders/texture_copy.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/texture_copy.hlsli
@ -11,13 +11,14 @@ cbuffer xe_texture_copy_constants : register(b0) {
  uint xe_texture_copy_host_base;
  uint xe_texture_copy_host_pitch;

-  // Size in blocks.
-  uint3 xe_texture_copy_size;
+  uint3 xe_texture_copy_size_texels;
  bool xe_texture_copy_is_3d;

+  uint3 xe_texture_copy_size_blocks;
+  uint xe_texture_copy_endianness;
+
  // Offset within the packed mip for small mips.
  uint3 xe_texture_copy_guest_mip_offset;
-  uint xe_texture_copy_endianness;
 };

 #define XeTextureCopyGuestPitchTiled 0xFFFFFFFFu
@ -33,15 +34,15 @@ uint4 XeTextureCopyGuestBlockOffsets(uint3 block_index, uint bpb,
  [branch] if (xe_texture_copy_guest_pitch == XeTextureCopyGuestPitchTiled) {
    [branch] if (xe_texture_copy_is_3d) {
      block_offsets_guest = XeTextureTiledOffset3D(
-          block_index_guest, xe_texture_copy_size.xy, bpb_log2);
+          block_index_guest, xe_texture_copy_size_blocks.xy, bpb_log2);
    } else {
      block_offsets_guest = XeTextureTiledOffset2D(
-          block_index_guest.xy, xe_texture_copy_size.x, bpb_log2);
+          block_index_guest.xy, xe_texture_copy_size_blocks.x, bpb_log2);
    }
  } else {
    block_offsets_guest =
        uint4(0u, 1u, 2u, 3u) * bpb + XeTextureGuestLinearOffset(
-            block_index_guest, xe_texture_copy_size.y,
+            block_index_guest, xe_texture_copy_size_blocks.y,
            xe_texture_copy_guest_pitch, 16u);
  }
  return block_offsets_guest + xe_texture_copy_guest_base;
--- a/src/xenia/gpu/d3d12/shaders/texture_load_128bpb.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/texture_load_128bpb.cs.hlsl
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  // 1 thread = 4 uint4 blocks.
  uint3 block_index = xe_thread_id;
  block_index.x <<= 2u;
-  [branch] if (any(block_index >= xe_texture_copy_size)) {
+  [branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
    return;
  }
  uint4 block_offsets_guest =
@ -19,8 +19,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  block_2 = XeByteSwap(block_2, xe_texture_copy_endianness);
  block_3 = XeByteSwap(block_3, xe_texture_copy_endianness);
  uint block_offset_host = XeTextureHostLinearOffset(
-      block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 16u) +
-      xe_texture_copy_host_base;
+      block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
+      16u) + xe_texture_copy_host_base;
  uint4 block_offsets_host = uint4(0u, 16u, 32u, 48u) + block_offset_host;
  xe_texture_copy_dest.Store4(block_offsets_host.x, block_0);
  xe_texture_copy_dest.Store4(block_offsets_host.y, block_1);
--- a/src/xenia/gpu/d3d12/shaders/texture_load_16bpb.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/texture_load_16bpb.cs.hlsl
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  // 1 thread = 4 ushort blocks.
  uint3 block_index = xe_thread_id;
  block_index.x <<= 2u;
-  [branch] if (any(block_index >= xe_texture_copy_size)) {
+  [branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
    return;
  }
  uint4 block_offsets_guest =
@ -18,8 +18,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  blocks = (blocks >> ((block_offsets_guest & 2u) << 3u)) & 0xFFFFu;
  blocks = XeByteSwap16(blocks, xe_texture_copy_endianness);
  uint block_offset_host = XeTextureHostLinearOffset(
-      block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 2u) +
-      xe_texture_copy_host_base;
+      block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
+      2u) + xe_texture_copy_host_base;
  xe_texture_copy_dest.Store2(block_offset_host,
                              blocks.xz | (blocks.yw << 16u));
 }
--- a/src/xenia/gpu/d3d12/shaders/texture_load_32bpb.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/texture_load_32bpb.cs.hlsl
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  // 1 thread = 4 uint blocks.
  uint3 block_index = xe_thread_id;
  block_index.x <<= 2u;
-  [branch] if (any(block_index >= xe_texture_copy_size)) {
+  [branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
    return;
  }
  uint4 block_offsets_guest =
@ -16,7 +16,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
                       xe_texture_copy_source.Load(block_offsets_guest.w));
  blocks = XeByteSwap(blocks, xe_texture_copy_endianness);
  uint block_offset_host = XeTextureHostLinearOffset(
-      block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 4u) +
-      xe_texture_copy_host_base;
+      block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
+      4u) + xe_texture_copy_host_base;
  xe_texture_copy_dest.Store4(block_offset_host, blocks);
 }
--- a/src/xenia/gpu/d3d12/shaders/texture_load_64bpb.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/texture_load_64bpb.cs.hlsl
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  // 1 thread = 4 uint2 blocks.
  uint3 block_index = xe_thread_id;
  block_index.x <<= 2u;
-  [branch] if (any(block_index >= xe_texture_copy_size)) {
+  [branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
    return;
  }
  uint4 block_offsets_guest =
@ -17,8 +17,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
  blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);
  uint block_offset_host = XeTextureHostLinearOffset(
-      block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 8u) +
-      xe_texture_copy_host_base;
+      block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
+      8u) + xe_texture_copy_host_base;
  xe_texture_copy_dest.Store4(block_offset_host, blocks_01);
  xe_texture_copy_dest.Store4(block_offset_host + 16u, blocks_23);
 }
--- a/src/xenia/gpu/d3d12/shaders/texture_load_8bpb.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/texture_load_8bpb.cs.hlsl
@ -5,7 +5,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  // 1 thread = 4 ubyte blocks.
  uint3 block_index = xe_thread_id;
  block_index.x <<= 2u;
-  [branch] if (any(block_index >= xe_texture_copy_size)) {
+  [branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
    return;
  }
  uint4 block_offsets_guest =
@ -20,7 +20,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  blocks.xy |= blocks.zw;
  blocks.x |= blocks.y;
  uint block_offset_host = XeTextureHostLinearOffset(
-      block_index, xe_texture_copy_size.y, xe_texture_copy_host_pitch, 1u) +
-      xe_texture_copy_host_base;
+      block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch,
+      1u) + xe_texture_copy_host_base;
  xe_texture_copy_dest.Store(block_offset_host, blocks.x);
 }
--- a/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl
@ -0,0 +1,101 @@
+#include "texture_copy.hlsli"
+
+// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
+// CTX1 is like DXT3/5 color, but 2-component and with 8:8 endpoints rather than
+// 5:6:5.
+//
+// Dword 1:
+// rrrrrrrr gggggggg
+// RRRRRRRR GGGGGGGG
+// Dword 2:
+// AA BB CC DD
+// EE FF GG HH
+// II JJ KK LL
+// MM NN OO PP
+
+// Decodes one row of texels (four texels, A-D, per block) of four CTX1 blocks
+// into packed 8-bit R,G pairs.
+//
+// weights_high - one dword of 2-bit texel weights per block (x-w = blocks
+//     0-3). Each field is the weight applied to the "high" endpoint; its
+//     bitwise complement is used as the weight of the "low" endpoint, so the
+//     two weights of a texel always add up to 3.
+// weights_shift - bit position of texel A's weight field for the row being
+//     decoded; texels B, C and D are read 2, 4 and 6 bits above it.
+// end_low_rr00gg00, end_high_rr00gg00 - per-block endpoints with R in bits
+//     0-15 and G in bits 16-31, so that multiplying by a weight does not make
+//     the two components overlap.
+//     NOTE(review): assumes each component is at most 0xFF, as unpacked by
+//     main in this file - confirm for any other caller.
+// row_01 - receives the decoded row for blocks 0 and 1.
+// row_23 - receives the decoded row for blocks 2 and 3.
+//     Each output dword holds two R8G8 texels; see the layout comments below.
+void XeCTX1FourBlocksRowToR8G8(uint4 weights_high, uint weights_shift,
+                               uint4 end_low_rr00gg00, uint4 end_high_rr00gg00,
+                               out uint4 row_01, out uint4 row_23) {
+  // Every 2-bit field of the complement is 3 minus the matching field of
+  // weights_high.
+  uint4 weights_low = ~weights_high;
+  uint4 weights_shifts = uint4(0u, 2u, 4u, 6u) + weights_shift;
+  // Texels A-D of each block, still multiplied by 3 (low * weight_low +
+  // high * weight_high) - the division by 3 happens while packing below.
+  uint4 row_3aaaa =
+      ((weights_low >> weights_shifts.x) & 3u) * end_low_rr00gg00 +
+      ((weights_high >> weights_shifts.x) & 3u) * end_high_rr00gg00;
+  uint4 row_3bbbb =
+      ((weights_low >> weights_shifts.y) & 3u) * end_low_rr00gg00 +
+      ((weights_high >> weights_shifts.y) & 3u) * end_high_rr00gg00;
+  uint4 row_3cccc =
+      ((weights_low >> weights_shifts.z) & 3u) * end_low_rr00gg00 +
+      ((weights_high >> weights_shifts.z) & 3u) * end_high_rr00gg00;
+  uint4 row_3dddd =
+      ((weights_low >> weights_shifts.w) & 3u) * end_low_rr00gg00 +
+      ((weights_high >> weights_shifts.w) & 3u) * end_high_rr00gg00;
+  // Regroup blocks 0 and 1 as (A0, C0, A1, C1) and (B0, D0, B1, D1) so that
+  // texels A|B and C|D of the same block end up in the same output dword.
+  uint4 row_half_3acac = uint4(row_3aaaa.xy, row_3cccc.xy).xzyw;
+  uint4 row_half_3bdbd = uint4(row_3bbbb.xy, row_3dddd.xy).xzyw;
+  // R0A G0A R0B G0B | R0C G0C R0D G0D | R1A G1A R1B G1B | R1C G1C R1D G1D
+  row_01 = ((row_half_3acac & 0xFFFFu) / 3u) |
+           (((row_half_3acac >> 16u) / 3u) << 8u) |
+           (((row_half_3bdbd & 0xFFFFu) / 3u) << 16u) |
+           (((row_half_3bdbd >> 16u) / 3u) << 24u);
+  // Same regrouping and packing for blocks 2 and 3.
+  row_half_3acac = uint4(row_3aaaa.zw, row_3cccc.zw).xzyw;
+  row_half_3bdbd = uint4(row_3bbbb.zw, row_3dddd.zw).xzyw;
+  // R2A G2A R2B G2B | R2C G2C R2D G2D | R3A G3A R3B G3B | R3C G3C R3D G3D
+  row_23 = ((row_half_3acac & 0xFFFFu) / 3u) |
+           (((row_half_3acac >> 16u) / 3u) << 8u) |
+           (((row_half_3bdbd & 0xFFFFu) / 3u) << 16u) |
+           (((row_half_3bdbd >> 16u) / 3u) << 24u);
+}
+
+// Compute shader entry point of the CTX1 loader: reads CTX1 blocks from
+// xe_texture_copy_source, decompresses them and writes the R8G8 texels to
+// xe_texture_copy_dest. With 8x32 threads per group and 4 blocks per thread,
+// one group covers 32x32 blocks.
+[numthreads(8, 32, 1)]
+void main(uint3 xe_thread_id : SV_DispatchThreadID) {
+  // 1 thread = 4 CTX1 (8bpb) blocks to 16x4 R8G8 texels.
+  uint3 block_index = xe_thread_id;
+  block_index.x <<= 2u;
+  // Threads starting outside the texture (measured in blocks) have nothing to
+  // do.
+  [branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
+    return;
+  }
+  // Fetch the four 8-byte blocks (two dwords each) and byte-swap them
+  // according to xe_texture_copy_endianness (XeByteSwap is defined elsewhere).
+  uint4 block_offsets_guest =
+      XeTextureCopyGuestBlockOffsets(block_index, 8u, 3u);
+  uint4 blocks_01 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.x),
+                          xe_texture_copy_source.Load2(block_offsets_guest.y));
+  uint4 blocks_23 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.z),
+                          xe_texture_copy_source.Load2(block_offsets_guest.w));
+  blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
+  blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);
+
+  // Sort the color indices so they can be used as weights for the second
+  // endpoint. Initially 00 = 3:0, 01 = 0:3, 10 = 2:1, 11 = 1:2.
+  // The indices are the second dword of each of the four blocks.
+  uint4 weights_high = uint4(blocks_01.yw, blocks_23.yw);
+  // Swap bits. 00 = 3:0, 01 = 2:1, 10 = 0:3, 11 = 1:2.
+  weights_high = ((weights_high & 0x55555555u) << 1u) |
+                 ((weights_high & 0xAAAAAAAAu) >> 1u);
+  // Swap 10 and 11. 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3.
+  weights_high ^= ((weights_high & 0xAAAAAAAAu) >> 1u);
+
+  // Unpack the endpoints as:
+  // 0x00g000r0 0x00g100r1 0x00g200r2 0x00g300r3
+  // 0x00G000R0 0x00G100R1 0x00G200R2 0x00G300R3
+  // so they can be multiplied by their weights allowing overflow.
+  // The endpoints are the first dword of each of the four blocks.
+  uint4 end_packed = uint4(blocks_01.xz, blocks_23.xz);
+  // Bits 0-7 stay in place, bits 8-15 move to bits 16-23.
+  uint4 end_low_rr00gg00 =
+      (end_packed & 0xFFu) | ((end_packed & 0xFF00u) << 8u);
+  // Bits 16-23 move to bits 0-7, bits 24-31 move to bits 16-23.
+  uint4 end_high_rr00gg00 =
+      ((end_packed & 0xFF0000u) >> 16u) | ((end_packed & 0xFF000000u) >> 8u);
+
+  // Uncompress and write the rows.
+  // A block is 4x4 texels, so the first texel written is at 4x the block index
+  // along X and Y; 2u is the size of one R8G8 texel in bytes.
+  uint3 texel_index_host = block_index << uint3(2u, 2u, 0u);
+  uint texel_offset_host = XeTextureHostLinearOffset(
+      texel_index_host, xe_texture_copy_size_texels.y,
+      xe_texture_copy_host_pitch, 2u) + xe_texture_copy_host_base;
+  // One iteration per texel row of the blocks; row i takes its weights from
+  // bits 8 * i to 8 * i + 7 of each block's index dword.
+  for (uint i = 0u; i < 4u; ++i) {
+    uint4 row_01, row_23;
+    XeCTX1FourBlocksRowToR8G8(weights_high, i * 8u, end_low_rr00gg00,
+                              end_high_rr00gg00, row_01, row_23);
+    // 16 texels * 2 bytes = 32 bytes per row, written as two 16-byte stores.
+    xe_texture_copy_dest.Store4(texel_offset_host, row_01);
+    xe_texture_copy_dest.Store4(texel_offset_host + 16u, row_23);
+    // Stop at the last texel row of the texture so that no rows are written
+    // below it when the height is not a multiple of 4.
+    if (++texel_index_host.y >= xe_texture_copy_size_texels.y) {
+      return;
+    }
+    // Advance to the next row of the host buffer.
+    texel_offset_host += xe_texture_copy_host_pitch;
+  }
+}
--- a/src/xenia/gpu/d3d12/texture_cache.cc
+++ b/src/xenia/gpu/d3d12/texture_cache.cc
@ -29,6 +29,7 @@ namespace d3d12 {
 #include "xenia/gpu/d3d12/shaders/bin/texture_load_32bpb_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/texture_load_64bpb_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/texture_load_8bpb_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/texture_load_ctx1_cs.h"

 const TextureCache::HostFormat TextureCache::host_formats_[64] = {
    {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown},           // k_1_REVERSE
@ -92,7 +93,7 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = {
    {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown},  // k_32_32_32_FLOAT
    {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown},  // k_DXT3A
    {DXGI_FORMAT_BC4_UNORM, CopyMode::k64bpb},  // k_DXT5A
-    {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown},  // k_CTX1
+    {DXGI_FORMAT_R8G8_UNORM, CopyMode::kCTX1},  // k_CTX1
    {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown},  // k_DXT3A_AS_1_1_1_1
    {DXGI_FORMAT_R8G8B8A8_UNORM, CopyMode::k32bpb},  // k_8_8_8_8_GAMMA
    {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown},       // k_2_10_10_10_FLOAT_EDRAM
@ -107,6 +108,7 @@ const TextureCache::CopyModeInfo TextureCache::copy_mode_info_[] = {
    {texture_load_32bpb_cs, sizeof(texture_load_32bpb_cs)},
    {texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)},
    {texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)},
+    {texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)},
 };

 TextureCache::TextureCache(D3D12CommandProcessor* command_processor,
@ -820,11 +822,14 @@ bool TextureCache::LoadTextureData(Texture* texture) {
                                       : texture->mip_pitches[j];
      copy_constants.host_base = uint32_t(host_layouts[j].Offset);
      copy_constants.host_pitch = host_layouts[j].Footprint.RowPitch;
-      copy_constants.size[0] =
-          (std::max(width >> j, 1u) + (block_width - 1)) / block_width;
-      copy_constants.size[1] =
-          (std::max(height >> j, 1u) + (block_height - 1)) / block_height;
-      copy_constants.size[2] = std::max(depth >> j, 1u);
+      copy_constants.size_texels[0] = std::max(width >> j, 1u);
+      copy_constants.size_texels[1] = std::max(height >> j, 1u);
+      copy_constants.size_texels[2] = std::max(depth >> j, 1u);
+      copy_constants.size_blocks[0] =
+          (copy_constants.size_texels[0] + (block_width - 1)) / block_width;
+      copy_constants.size_blocks[1] =
+          (copy_constants.size_texels[1] + (block_height - 1)) / block_height;
+      copy_constants.size_blocks[2] = copy_constants.size_texels[2];
      if (texture->key.packed_mips) {
        texture_util::GetPackedMipOffset(width, height, depth, guest_format, j,
                                         copy_constants.guest_mip_offset[0],
@ -843,9 +848,9 @@ bool TextureCache::LoadTextureData(Texture* texture) {
      std::memcpy(cbuffer_mapping, &copy_constants, sizeof(copy_constants));
      command_list->SetComputeRootConstantBufferView(0, cbuffer_gpu_address);
      // Each thread group processes 32x32x1 blocks.
-      command_list->Dispatch((copy_constants.size[0] + 31) >> 5,
-                             (copy_constants.size[1] + 31) >> 5,
-                             copy_constants.size[2]);
+      command_list->Dispatch((copy_constants.size_blocks[0] + 31) >> 5,
+                             (copy_constants.size_blocks[1] + 31) >> 5,
+                             copy_constants.size_blocks[2]);
    }
    barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
    barriers[0].UAV.pResource = copy_buffer;
--- a/src/xenia/gpu/d3d12/texture_cache.h
+++ b/src/xenia/gpu/d3d12/texture_cache.h
@ -90,6 +90,7 @@ class TextureCache {
    k32bpb,
    k64bpb,
    k128bpb,
+    kCTX1,

    kCount,

@ -194,14 +195,16 @@ class TextureCache {
    uint32_t host_pitch;

    // vec4 1.
-    // Size in blocks.
-    uint32_t size[3];
+    uint32_t size_texels[3];
    uint32_t is_3d;

    // vec4 2.
+    uint32_t size_blocks[3];
+    uint32_t endianness;
+
+    // vec4 3.
    // Offset within the packed mip for small mips.
    uint32_t guest_mip_offset[3];
-    uint32_t endianness;

    static constexpr uint32_t kGuestPitchTiled = UINT32_MAX;
  };