From ea1abdaa6eb53737c842b87d33494a94bad0f6f9 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Thu, 23 Aug 2018 13:25:36 +0300
Subject: [PATCH] [D3D12] Raw 32bpp resolve

---
 src/xenia/gpu/d3d12/render_target_cache.cc    | 104 +++++++++++++++++-
 src/xenia/gpu/d3d12/render_target_cache.h     |   8 +-
 src/xenia/gpu/d3d12/shaders/byte_swap.hlsli   |  24 +++-
 .../gpu/d3d12/shaders/edram_load_store.hlsli  |   8 +-
 .../shaders/edram_tile_sample_32bpp.cs.hlsl   |  23 ++--
 .../gpu/d3d12/shaders/pixel_formats.hlsli     |   2 +-
 src/xenia/gpu/d3d12/shared_memory.cc          |   2 +-
 src/xenia/gpu/d3d12/shared_memory.h           |   3 +-
 8 files changed, 146 insertions(+), 28 deletions(-)

diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc
index 4e7add734..28adddf6f 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@@ -893,9 +893,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
     assert_always();
     return false;
   }
+  Endian128 dest_endian = Endian128(dest_info & 0x7);
   int32_t dest_exp_bias =
       !is_depth ? (int32_t((dest_info >> 16) << 26) >> 26) : 0;
-  uint32_t dest_swap = (dest_info >> 24) & 0x1;
+  bool dest_swap = !is_depth && ((dest_info >> 24) & 0x1);
 
   // Get the destination location.
   uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF;
@@ -950,14 +951,105 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
   //   RTV of the destination format.
   auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
   auto device = provider->GetDevice();
+  auto descriptor_size_view = provider->GetDescriptorSizeView();
   if (sample_select <= xenos::CopySampleSelect::k3 &&
       src_texture_format == dest_format && dest_exp_bias == 0) {
     XELOGGPU("Resolving a single sample without conversion");
+    if (src_64bpp) {
+      // TODO(Triang3l): 64bpp sample copy shader.
+      return false;
+    }
+
     // Make sure we have the memory to write to.
     if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
       return false;
     }
-    // TODO(Triang3l): Raw resolve.
+
+    // Write the source and destination descriptors.
+    D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
+    D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
+    if (command_processor_->RequestViewDescriptors(
+            0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) {
+      return false;
+    }
+    D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc;
+    srv_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+    srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
+    srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+    srv_desc.Buffer.FirstElement = 0;
+    srv_desc.Buffer.NumElements = 2 * 2048 * 1280;
+    srv_desc.Buffer.StructureByteStride = 0;
+    srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
+    device->CreateShaderResourceView(edram_buffer_, &srv_desc,
+                                     descriptor_cpu_start);
+    D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle;
+    uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view;
+    shared_memory->CreateRawUAV(uav_cpu_handle);
+
+    // Transition the buffers.
+    command_processor_->PushTransitionBarrier(
+        edram_buffer_, edram_buffer_state_,
+        D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+    edram_buffer_state_ = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    shared_memory->UseForWriting();
+    command_processor_->SubmitBarriers();
+
+    // Dispatch the computation.
+    command_list->SetComputeRootSignature(edram_load_store_root_signature_);
+    EDRAMLoadStoreRootConstants root_constants;
+    root_constants.tile_sample_rect_tl = copy_rect.left | (copy_rect.top << 16);
+    root_constants.tile_sample_rect_br =
+        copy_rect.right | (copy_rect.bottom << 16);
+    root_constants.tile_sample_dest_base = dest_address;
+    assert_true(dest_pitch <= 8192);
+    root_constants.tile_sample_dest_info = dest_pitch |
+                                           (uint32_t(sample_select) << 16) |
+                                           (uint32_t(dest_endian) << 18);
+    if (msaa_samples >= MsaaSamples::k2X) {
+      root_constants.tile_sample_dest_info |= 1 << 14;
+      if (msaa_samples >= MsaaSamples::k4X) {
+        root_constants.tile_sample_dest_info |= 1 << 15;
+      }
+    }
+    if (dest_swap) {
+      switch (ColorRenderTargetFormat(src_format)) {
+        case ColorRenderTargetFormat::k_8_8_8_8:
+        case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
+          root_constants.tile_sample_dest_info |= (8 << 21) | (16 << 26);
+          break;
+        case ColorRenderTargetFormat::k_2_10_10_10:
+        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
+        case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16:
+        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
+          root_constants.tile_sample_dest_info |= (10 << 21) | (20 << 26);
+          break;
+        case ColorRenderTargetFormat::k_16_16_16_16:
+        case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
+          root_constants.tile_sample_dest_info |= 1 << 21;
+          break;
+        default:
+          break;
+      }
+    }
+    root_constants.base_pitch_tiles = edram_base | (surface_pitch_tiles << 11);
+    command_list->SetComputeRoot32BitConstants(
+        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
+    command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
+    // TODO(Triang3l): 64bpp pipeline.
+    command_processor_->SetPipeline(edram_tile_sample_32bpp_pipeline_);
+    // 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
+    uint32_t group_count_x = row_tiles, group_count_y = rows;
+    if (msaa_samples >= MsaaSamples::k2X) {
+      group_count_y = (group_count_y + 1) >> 1;
+      if (msaa_samples >= MsaaSamples::k4X) {
+        group_count_x = (group_count_x + 1) >> 1;
+      }
+    }
+    command_list->Dispatch(group_count_x, group_count_y, 1);
+
+    // Commit the write.
+    command_processor_->PushUAVBarrier(shared_memory->GetBuffer());
+
     // Make the texture cache refresh the data.
     shared_memory->RangeWrittenByGPU(dest_address, dest_size);
   } else {
@@ -1386,8 +1478,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
     command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
                                     nullptr);
     EDRAMLoadStoreRootConstants root_constants;
-    root_constants.base_pitch_tiles =
-        binding.edram_base | (rt_pitch_tiles << 11);
     root_constants.rt_color_depth_offset =
         uint32_t(location_dest.PlacedFootprint.Offset);
     root_constants.rt_color_depth_pitch =
@@ -1402,6 +1492,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
       root_constants.rt_stencil_pitch =
           location_dest.PlacedFootprint.Footprint.RowPitch;
     }
+    root_constants.base_pitch_tiles =
+        binding.edram_base | (rt_pitch_tiles << 11);
 
     // Transition the copy buffer to SRV.
     command_processor_->PushTransitionBarrier(
@@ -1534,8 +1626,6 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
     // Load the data.
     command_processor_->SubmitBarriers();
     EDRAMLoadStoreRootConstants root_constants;
-    root_constants.base_pitch_tiles =
-        edram_bases[i] | (edram_pitch_tiles << 11);
     root_constants.rt_color_depth_offset =
         uint32_t(render_target->footprints[0].Offset);
     root_constants.rt_color_depth_pitch =
@@ -1546,6 +1636,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
       root_constants.rt_stencil_pitch =
           render_target->footprints[1].Footprint.RowPitch;
     }
+    root_constants.base_pitch_tiles =
+        edram_bases[i] | (edram_pitch_tiles << 11);
     command_list->SetComputeRoot32BitConstants(
         0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
     EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h
index 3c26b5f47..253624854 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@@ -385,11 +385,11 @@ class RenderTargetCache {
         // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
         // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
         // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
-        // 18:19 - destination endianness.
-        // 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
+        // 18:20 - destination endianness.
+        // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
         //   For 32 bits per pixel:
-        //     20:24 - red/blue bit depth.
-        //     25:29 - blue offset.
+        //     21:25 - red/blue bit depth.
+        //     26:30 - blue offset.
         //   For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
         uint32_t tile_sample_dest_info;
       };
diff --git a/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli b/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli
index b345a5ad3..1cdd55ae1 100644
--- a/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli
@@ -1,12 +1,15 @@
 #ifndef XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
 #define XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
 
+// The endian argument may be passed without being masked with & 3, so bits
+// above 0:1 can be set - test individual bits, don't use ==, <=, >= on it!
+
 #define XE_BYTE_SWAP_OVERLOAD(XeByteSwapType) \
 XeByteSwapType XeByteSwap(XeByteSwapType v, uint endian) { \
-  [flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
+  if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
     v = ((v & 0x00FF00FFu) << 8u) | ((v & 0xFF00FF00u) >> 8u); \
   } \
-  [flatten] if ((endian & 2u) != 0u) { \
+  if ((endian & 2u) != 0u) { \
     v = (v << 16u) | (v >> 16u); \
   } \
   return v; \
@@ -18,7 +21,7 @@ XE_BYTE_SWAP_OVERLOAD(uint4)
 
 #define XE_BYTE_SWAP_16_OVERLOAD(XeByteSwapType) \
 XeByteSwapType XeByteSwap16(XeByteSwapType v, uint endian) { \
-  [flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
+  if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
     v = (v << 8u) | (v >> 8u); \
   } \
   return v; \
@@ -28,4 +31,19 @@ XE_BYTE_SWAP_16_OVERLOAD(uint2)
 XE_BYTE_SWAP_16_OVERLOAD(uint3)
 XE_BYTE_SWAP_16_OVERLOAD(uint4)
 
+uint2 XeByteSwap64(uint2 v, uint endian) {
+  // Bit 2 of endian: exchange the two dwords, then swap each as endian 2.
+  bool swap_dwords = (endian & 4u) != 0u;
+  uint2 ordered = swap_dwords ? v.yx : v;
+  uint endian_32 = swap_dwords ? 2u : endian;
+  return XeByteSwap(ordered, endian_32);
+}
+uint4 XeByteSwap64(uint4 v, uint endian) {
+  // Bit 2 of endian: exchange the dwords of each pair, then swap as endian 2.
+  bool swap_dwords = (endian & 4u) != 0u;
+  uint4 ordered = swap_dwords ? v.yxwz : v;
+  uint endian_32 = swap_dwords ? 2u : endian;
+  return XeByteSwap(ordered, endian_32);
+}
+
 #endif  // XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
index 0314abd6d..e55b783a4 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
@@ -23,11 +23,11 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
 // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
 // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
 // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
-// 18:19 - destination endianness.
-// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
+// 18:20 - destination endianness.
+// 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
 //   For 32 bits per pixel:
-//     20:24 - red/blue bit depth.
-//     25:29 - blue offset.
+//     21:25 - red/blue bit depth.
+//     26:30 - blue offset.
 //   For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
 #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
 
diff --git a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
index 29633a1aa..0df7dbdbb 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
@@ -1,3 +1,4 @@
+#include "byte_swap.hlsli"
 #include "edram_load_store.hlsli"
 #include "texture_address.hlsli"
 
@@ -6,8 +7,9 @@ void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_group_thread_id : SV_GroupThreadID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
   // Check if not outside of the destination texture completely.
-  uint4 copy_rect =
-      (xe_edram_tile_sample_rect.xyxy >> uint4(0u, 0u, 16u, 16u)) & 0xFFFFu;
+  uint4 copy_rect;
+  copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
+  copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
   uint2 texel_index = xe_thread_id.xy;
   texel_index.x *= 4u;
   [branch] if (any(texel_index < copy_rect.xy) ||
@@ -19,9 +21,12 @@ void main(uint3 xe_group_id : SV_GroupID,
   // XY - log2(pixel size), ZW - selected sample offset.
   uint4 sample_info =
       (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
+  uint2 edram_tile_quarter =
+      uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
   uint edram_offset = XeEDRAMOffset(
-      xe_group_id.xy << sample_info.xy,
-      xe_thread_id.xy << (sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
+      (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
+      ((xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
+       (sample_info.xy + uint2(2u, 0u))) + sample_info.zw);  // Shift, then add.
   // At 1x and 2x, this contains samples of 4 pixels. At 4x, this contains
   // samples of 2, need to load 2 more.
   uint4 pixels = xe_edram_load_store_source.Load4(edram_offset);
@@ -30,7 +35,7 @@ void main(uint3 xe_group_id : SV_GroupID,
     pixels.zw = xe_edram_load_store_source.Load3(edram_offset + 16u).xz;
   }
 
-  uint red_blue_swap = xe_edram_tile_sample_dest_info >> 20u;
+  uint red_blue_swap = xe_edram_tile_sample_dest_info >> 21u;
   if (red_blue_swap != 0u) {
     uint red_mask = (1u << (red_blue_swap & 31u)) - 1u;
     // No need to be ready for a long shift Barney, it's just 16 or 20.
@@ -42,16 +47,18 @@ void main(uint3 xe_group_id : SV_GroupID,
   }
 
   // Tile the pixels to the shared memory.
+  pixels = XeByteSwap(pixels, xe_edram_tile_sample_dest_info >> 18u);
   uint4 texel_addresses =
       xe_edram_tile_sample_dest_base +
       XeTextureTiledOffset2D(texel_index - copy_rect.xy,
                              xe_edram_tile_sample_dest_info & 16383u, 2u);
   xe_edram_load_store_dest.Store(texel_addresses.x, pixels.x);
-  [branch] if (texel_index.x + 1u < copy_rect.z) {
+  bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
+  [branch] if (texels_in_rect.x) {
     xe_edram_load_store_dest.Store(texel_addresses.y, pixels.y);
-    [branch] if (texel_index.x + 2u < copy_rect.z) {
+    [branch] if (texels_in_rect.y) {
       xe_edram_load_store_dest.Store(texel_addresses.z, pixels.z);
-      [branch] if (texel_index.x + 3u < copy_rect.z) {
+      [branch] if (texels_in_rect.z) {
         xe_edram_load_store_dest.Store(texel_addresses.w, pixels.w);
       }
     }
diff --git a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
index fbdbb0221..eff4e0a7d 100644
--- a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
@@ -57,7 +57,7 @@ uint4 XeFloat32To20e4(uint4 f32u32) {
 }
 
 uint4 XeFloat20e4To32(uint4 f24u32) {
-  uint4 mantissa = f24u32 & 0xF00000u;
+  uint4 mantissa = f24u32 & 0xFFFFFu;
   uint4 exponent = f24u32 >> 20u;
   // Normalize the values for the denormalized components.
   // Exponent = 1;
diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc
index 118ac9e62..158d1decd 100644
--- a/src/xenia/gpu/d3d12/shared_memory.cc
+++ b/src/xenia/gpu/d3d12/shared_memory.cc
@@ -541,7 +541,7 @@ void SharedMemory::CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
   device->CreateShaderResourceView(buffer_, &desc, handle);
 }
 
-void SharedMemory::CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
+void SharedMemory::CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
   auto device =
       command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
   D3D12_UNORDERED_ACCESS_VIEW_DESC desc;
diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/d3d12/shared_memory.h
index 23f85dfa9..39b21eb8a 100644
--- a/src/xenia/gpu/d3d12/shared_memory.h
+++ b/src/xenia/gpu/d3d12/shared_memory.h
@@ -36,6 +36,7 @@ class SharedMemory {
   bool Initialize();
   void Shutdown();
 
+  ID3D12Resource* GetBuffer() const { return buffer_; }  // E.g. for barriers.
   D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const {
     return buffer_gpu_address_;
   }
@@ -90,7 +91,7 @@ class SharedMemory {
   void UseForWriting();
 
   void CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
-  void CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
+  void CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
 
  private:
   D3D12CommandProcessor* command_processor_;