[D3D12] Raw 32bpp resolve

2018-08-23 13:25:36 +03:00 · 2018-08-23 13:25:36 +03:00 · ea1abdaa6e
parent bc4125584c
commit ea1abdaa6e
8 changed files with 146 additions and 28 deletions
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@ -893,9 +893,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
    assert_always();
    return false;
  }
+  Endian128 dest_endian = Endian128(dest_info & 0x7);
  int32_t dest_exp_bias =
      !is_depth ? (int32_t((dest_info >> 16) << 26) >> 26) : 0;
-  uint32_t dest_swap = (dest_info >> 24) & 0x1;
+  bool dest_swap = !is_depth && ((dest_info >> 24) & 0x1);

  // Get the destination location.
  uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF;
@ -950,14 +951,105 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
  //   RTV of the destination format.
  auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto device = provider->GetDevice();
+  auto descriptor_size_view = provider->GetDescriptorSizeView();
  if (sample_select <= xenos::CopySampleSelect::k3 &&
      src_texture_format == dest_format && dest_exp_bias == 0) {
    XELOGGPU("Resolving a single sample without conversion");
+    if (src_64bpp) {
+      // TODO(Triang3l): 64bpp sample copy shader.
+      return false;
+    }
+
    // Make sure we have the memory to write to.
    if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
      return false;
    }
-    // TODO(Triang3l): Raw resolve.
+
+    // Write the source and destination descriptors.
+    D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
+    D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
+    if (command_processor_->RequestViewDescriptors(
+            0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) {
+      return false;
+    }
+    D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc;
+    srv_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+    srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
+    srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+    srv_desc.Buffer.FirstElement = 0;
+    srv_desc.Buffer.NumElements = 2 * 2048 * 1280;
+    srv_desc.Buffer.StructureByteStride = 0;
+    srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
+    device->CreateShaderResourceView(edram_buffer_, &srv_desc,
+                                     descriptor_cpu_start);
+    D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle;
+    uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view;
+    shared_memory->CreateRawUAV(uav_cpu_handle);
+
+    // Transition the buffers.
+    command_processor_->PushTransitionBarrier(
+        edram_buffer_, edram_buffer_state_,
+        D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+    edram_buffer_state_ = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    shared_memory->UseForWriting();
+    command_processor_->SubmitBarriers();
+
+    // Dispatch the computation.
+    command_list->SetComputeRootSignature(edram_load_store_root_signature_);
+    EDRAMLoadStoreRootConstants root_constants;
+    root_constants.tile_sample_rect_tl = copy_rect.left | (copy_rect.top << 16);
+    root_constants.tile_sample_rect_br =
+        copy_rect.right | (copy_rect.bottom << 16);
+    root_constants.tile_sample_dest_base = dest_address;
+    assert_true(dest_pitch <= 8192);
+    root_constants.tile_sample_dest_info = dest_pitch |
+                                           (uint32_t(sample_select) << 16) |
+                                           (uint32_t(dest_endian) << 18);
+    if (msaa_samples >= MsaaSamples::k2X) {
+      root_constants.tile_sample_dest_info |= 1 << 14;
+      if (msaa_samples >= MsaaSamples::k4X) {
+        root_constants.tile_sample_dest_info |= 1 << 15;
+      }
+    }
+    if (dest_swap) {
+      switch (ColorRenderTargetFormat(src_format)) {
+        case ColorRenderTargetFormat::k_8_8_8_8:
+        case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
+          root_constants.tile_sample_dest_info |= (8 << 21) | (16 << 26);
+          break;
+        case ColorRenderTargetFormat::k_2_10_10_10:
+        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
+        case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16:
+        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
+          root_constants.tile_sample_dest_info |= (10 << 21) | (20 << 26);
+          break;
+        case ColorRenderTargetFormat::k_16_16_16_16:
+        case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
+          root_constants.tile_sample_dest_info |= 1 << 21;
+          break;
+        default:
+          break;
+      }
+    }
+    root_constants.base_pitch_tiles = edram_base | (surface_pitch_tiles << 11);
+    command_list->SetComputeRoot32BitConstants(
+        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
+    command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
+    // TODO(Triang3l): 64bpp pipeline.
+    command_processor_->SetPipeline(edram_tile_sample_32bpp_pipeline_);
+    // 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
+    uint32_t group_count_x = row_tiles, group_count_y = rows;
+    if (msaa_samples >= MsaaSamples::k2X) {
+      group_count_y = (group_count_y + 1) >> 1;
+      if (msaa_samples >= MsaaSamples::k4X) {
+        group_count_x = (group_count_x + 1) >> 1;
+      }
+    }
+    command_list->Dispatch(group_count_x, group_count_y, 1);
+
+    // Commit the write.
+    command_processor_->PushUAVBarrier(shared_memory->GetBuffer());
+
    // Make the texture cache refresh the data.
    shared_memory->RangeWrittenByGPU(dest_address, dest_size);
  } else {
@ -1386,8 +1478,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
    command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
                                    nullptr);
    EDRAMLoadStoreRootConstants root_constants;
-    root_constants.base_pitch_tiles =
-        binding.edram_base | (rt_pitch_tiles << 11);
    root_constants.rt_color_depth_offset =
        uint32_t(location_dest.PlacedFootprint.Offset);
    root_constants.rt_color_depth_pitch =
@ -1402,6 +1492,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
      root_constants.rt_stencil_pitch =
          location_dest.PlacedFootprint.Footprint.RowPitch;
    }
+    root_constants.base_pitch_tiles =
+        binding.edram_base | (rt_pitch_tiles << 11);

    // Transition the copy buffer to SRV.
    command_processor_->PushTransitionBarrier(
@ -1534,8 +1626,6 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
    // Load the data.
    command_processor_->SubmitBarriers();
    EDRAMLoadStoreRootConstants root_constants;
-    root_constants.base_pitch_tiles =
-        edram_bases[i] | (edram_pitch_tiles << 11);
    root_constants.rt_color_depth_offset =
        uint32_t(render_target->footprints[0].Offset);
    root_constants.rt_color_depth_pitch =
@ -1546,6 +1636,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
      root_constants.rt_stencil_pitch =
          render_target->footprints[1].Footprint.RowPitch;
    }
+    root_constants.base_pitch_tiles =
+        edram_bases[i] | (edram_pitch_tiles << 11);
    command_list->SetComputeRoot32BitConstants(
        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
    EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@ -385,11 +385,11 @@ class RenderTargetCache {
        // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
        // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
        // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
-        // 18:19 - destination endianness.
-        // 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
+        // 18:20 - destination endianness.
+        // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
        //   For 32 bits per pixel:
-        //     20:24 - red/blue bit depth.
-        //     25:29 - blue offset.
+        //     21:25 - red/blue bit depth.
+        //     26:30 - blue offset.
        //   For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
        uint32_t tile_sample_dest_info;
      };
--- a/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli
@ -1,12 +1,15 @@
 #ifndef XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
 #define XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_

+// These functions may accept endianness without it being masked with & 3 -
+// don't use ==, <=, >= here!
+
 #define XE_BYTE_SWAP_OVERLOAD(XeByteSwapType) \
 XeByteSwapType XeByteSwap(XeByteSwapType v, uint endian) { \
-  [flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
+  if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
    v = ((v & 0x00FF00FFu) << 8u) | ((v & 0xFF00FF00u) >> 8u); \
  } \
-  [flatten] if ((endian & 2u) != 0u) { \
+  if ((endian & 2u) != 0u) { \
    v = (v << 16u) | (v >> 16u); \
  } \
  return v; \
@ -18,7 +21,7 @@ XE_BYTE_SWAP_OVERLOAD(uint4)

 #define XE_BYTE_SWAP_16_OVERLOAD(XeByteSwapType) \
 XeByteSwapType XeByteSwap16(XeByteSwapType v, uint endian) { \
-  [flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
+  if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
    v = (v << 8u) | (v >> 8u); \
  } \
  return v; \
@ -28,4 +31,19 @@ XE_BYTE_SWAP_16_OVERLOAD(uint2)
 XE_BYTE_SWAP_16_OVERLOAD(uint3)
 XE_BYTE_SWAP_16_OVERLOAD(uint4)

+uint2 XeByteSwap64(uint2 v, uint endian) {  // Endian swap for a 64-bit value stored as 2 dwords.
+  if (endian & 4u) {  // Bit 2 set: swap across the whole 64-bit value.
+    v = v.yx;  // Exchange the two dwords.
+    endian = 2u;  // 2 makes XeByteSwap do both of its swaps, reversing the bytes of each dword.
+  }
+  return XeByteSwap(v, endian);  // Per-dword swap; XeByteSwap only examines bits 0 and 1.
+}
+uint4 XeByteSwap64(uint4 v, uint endian) {  // Same as the uint2 overload, for two 64-bit values (xy, zw).
+  if (endian & 4u) {  // Bit 2 set: swap across each whole 64-bit value.
+    v = v.yxwz;  // Exchange the dwords within each pair.
+    endian = 2u;  // 2 makes XeByteSwap do both of its swaps, reversing the bytes of each dword.
+  }
+  return XeByteSwap(v, endian);  // Per-dword swap; XeByteSwap only examines bits 0 and 1.
+}
+
 #endif  // XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
--- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
@ -23,11 +23,11 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
 // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
 // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
 // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
-// 18:19 - destination endianness.
-// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
+// 18:20 - destination endianness.
+// 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
 //   For 32 bits per pixel:
-//     20:24 - red/blue bit depth.
-//     25:29 - blue offset.
+//     21:25 - red/blue bit depth.
+//     26:30 - blue offset.
 //   For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
 #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)

--- a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
@ -1,3 +1,4 @@
+#include "byte_swap.hlsli"
 #include "edram_load_store.hlsli"
 #include "texture_address.hlsli"

@ -6,8 +7,9 @@ void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Check if not outside of the destination texture completely.
-  uint4 copy_rect =
-      (xe_edram_tile_sample_rect.xyxy >> uint4(0u, 0u, 16u, 16u)) & 0xFFFFu;
+  uint4 copy_rect;
+  copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
+  copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
  uint2 texel_index = xe_thread_id.xy;
  texel_index.x *= 4u;
  [branch] if (any(texel_index < copy_rect.xy) ||
@ -19,9 +21,12 @@ void main(uint3 xe_group_id : SV_GroupID,
  // XY - log2(pixel size), ZW - selected sample offset.
  uint4 sample_info =
      (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
+  uint2 edram_tile_quarter =
+      uint2(uint2(10u, 8u) <= xe_group_thread_id) * sample_info.xy;
  uint edram_offset = XeEDRAMOffset(
-      xe_group_id.xy << sample_info.xy,
-      xe_thread_id.xy << (sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
+      (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
+      (xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
+      (sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
  // At 1x and 2x, this contains samples of 4 pixels. At 4x, this contains
  // samples of 2, need to load 2 more.
  uint4 pixels = xe_edram_load_store_source.Load4(edram_offset);
@ -30,7 +35,7 @@ void main(uint3 xe_group_id : SV_GroupID,
    pixels.zw = xe_edram_load_store_source.Load3(edram_offset + 16u).xz;
  }

-  uint red_blue_swap = xe_edram_tile_sample_dest_info >> 20u;
+  uint red_blue_swap = xe_edram_tile_sample_dest_info >> 21u;
  if (red_blue_swap != 0u) {
    uint red_mask = (1u << (red_blue_swap & 31u)) - 1u;
    // No need to be ready for a long shift Barney, it's just 16 or 20.
@ -42,16 +47,18 @@ void main(uint3 xe_group_id : SV_GroupID,
  }

  // Tile the pixels to the shared memory.
+  pixels = XeByteSwap(pixels, xe_edram_tile_sample_dest_info >> 18u);
  uint4 texel_addresses =
      xe_edram_tile_sample_dest_base +
      XeTextureTiledOffset2D(texel_index - copy_rect.xy,
                             xe_edram_tile_sample_dest_info & 16383u, 2u);
  xe_edram_load_store_dest.Store(texel_addresses.x, pixels.x);
-  [branch] if (texel_index.x + 1u < copy_rect.z) {
+  bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
+  [branch] if (texels_in_rect.x) {
    xe_edram_load_store_dest.Store(texel_addresses.y, pixels.y);
-    [branch] if (texel_index.x + 2u < copy_rect.z) {
+    [branch] if (texels_in_rect.y) {
      xe_edram_load_store_dest.Store(texel_addresses.z, pixels.z);
-      [branch] if (texel_index.x + 3u < copy_rect.z) {
+      [branch] if (texels_in_rect.z) {
        xe_edram_load_store_dest.Store(texel_addresses.w, pixels.w);
      }
    }
--- a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
@ -57,7 +57,7 @@ uint4 XeFloat32To20e4(uint4 f32u32) {
 }

 uint4 XeFloat20e4To32(uint4 f24u32) {
-  uint4 mantissa = f24u32 & 0xF00000u;
+  uint4 mantissa = f24u32 & 0xFFFFFu;
  uint4 exponent = f24u32 >> 20u;
  // Normalize the values for the denormalized components.
  // Exponent = 1;
--- a/src/xenia/gpu/d3d12/shared_memory.cc
+++ b/src/xenia/gpu/d3d12/shared_memory.cc
@ -541,7 +541,7 @@ void SharedMemory::CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
  device->CreateShaderResourceView(buffer_, &desc, handle);
 }

-void SharedMemory::CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
+void SharedMemory::CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
  auto device =
      command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
  D3D12_UNORDERED_ACCESS_VIEW_DESC desc;
--- a/src/xenia/gpu/d3d12/shared_memory.h
+++ b/src/xenia/gpu/d3d12/shared_memory.h
@ -36,6 +36,7 @@ class SharedMemory {
  bool Initialize();
  void Shutdown();

+  ID3D12Resource* GetBuffer() const { return buffer_; }  // Returns buffer_, e.g. for pushing a UAV barrier on it.
  D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const {
    return buffer_gpu_address_;
  }
@ -90,7 +91,7 @@ class SharedMemory {
  void UseForWriting();

  void CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
-  void CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
+  void CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle);

 private:
  D3D12CommandProcessor* command_processor_;