[D3D12] 64bpp raw resolve and EDRAM refactoring

2018-09-16 15:11:11 +03:00 · 2018-09-16 15:11:11 +03:00 · c9ffe98d21
parent 5be78ab369
commit c9ffe98d21
17 changed files with 188 additions and 106 deletions
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@ -41,6 +41,7 @@ namespace d3d12 {
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_32bpp_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_64bpp_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/resolve_ps.h"
 #include "xenia/gpu/d3d12/shaders/bin/resolve_vs.h"

@ -173,6 +174,16 @@ bool RenderTargetCache::Initialize() {
    return false;
  }
  edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp");
+  // Tile single sample into a texture - 64 bits per pixel.
+  edram_tile_sample_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
+      device, edram_tile_sample_64bpp_cs, sizeof(edram_tile_sample_64bpp_cs),
+      edram_load_store_root_signature_);
+  if (edram_tile_sample_64bpp_pipeline_ == nullptr) {
+    XELOGE("Failed to create the 64bpp EDRAM raw resolve pipeline");
+    Shutdown();
+    return false;
+  }
+  edram_tile_sample_64bpp_pipeline_->SetName(L"EDRAM Raw Resolve 64bpp");
  // Clear 32-bit color or unorm depth.
  edram_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_clear_32bpp_cs, sizeof(edram_clear_32bpp_cs),
@ -258,6 +269,7 @@ void RenderTargetCache::Shutdown() {
  }
  resolve_pipelines_.clear();
  ui::d3d12::util::ReleaseAndNull(resolve_root_signature_);
+  ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_);
  ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_);
  ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_);
  ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_);
@ -977,9 +989,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,

  // Validate and clamp the source region, skip parts that don't need to be
  // copied and calculate the number of threads needed for copying/loading.
-  uint32_t surface_pitch_tiles, row_tiles, rows;
+  uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
  if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base,
-                      copy_rect, surface_pitch_tiles, row_tiles, rows)) {
+                      copy_rect, surface_pitch_tiles, row_width_ss_div_80,
+                      rows)) {
    // Nothing to copy.
    return true;
  }
@ -1008,10 +1021,6 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
    // Raw copy
    // *************************************************************************
    XELOGGPU("Resolve: Copying using a compute shader");
-    if (src_64bpp) {
-      // TODO(Triang3l): 64bpp sample copy shader.
-      return false;
-    }

    // Make sure we have the memory to write to.
    if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
@ -1079,10 +1088,11 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
    command_list->SetComputeRoot32BitConstants(
        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
    command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
-    // TODO(Triang3l): 64bpp pipeline.
-    command_processor_->SetComputePipeline(edram_tile_sample_32bpp_pipeline_);
-    // 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
-    uint32_t group_count_x = row_tiles, group_count_y = rows;
+    command_processor_->SetComputePipeline(
+        src_64bpp ? edram_tile_sample_64bpp_pipeline_
+                  : edram_tile_sample_32bpp_pipeline_);
+    // 1 group per destination 80x16 region.
+    uint32_t group_count_x = row_width_ss_div_80, group_count_y = rows;
    if (msaa_samples >= MsaaSamples::k2X) {
      group_count_y = (group_count_y + 1) >> 1;
      if (msaa_samples >= MsaaSamples::k4X) {
@ -1121,7 +1131,7 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
      return false;
    }
    RenderTargetKey render_target_key;
-    render_target_key.width_ss_div_80 = row_tiles >> (src_64bpp ? 1 : 0);
+    render_target_key.width_ss_div_80 = row_width_ss_div_80;
    render_target_key.height_ss_div_16 = rows;
    render_target_key.is_depth = false;
    render_target_key.format = src_format;
@ -1190,7 +1200,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,

    command_processor_->SetComputePipeline(
        edram_load_pipelines_[size_t(GetLoadStoreMode(false, src_format))]);
-    command_list->Dispatch(row_tiles, rows, 1);
+    // 1 group per 80x16 samples.
+    command_list->Dispatch(row_width_ss_div_80, rows, 1);
    command_processor_->PushUAVBarrier(copy_buffer);

    // Go to the next descriptor set.
@ -1405,9 +1416,10 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
  bool is_64bpp =
      !is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format));
  D3D12_RECT clear_rect = rect;
-  uint32_t surface_pitch_tiles, row_tiles, rows;
+  uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
  if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base,
-                      clear_rect, surface_pitch_tiles, row_tiles, rows)) {
+                      clear_rect, surface_pitch_tiles, row_width_ss_div_80,
+                      rows)) {
    // Nothing to clear.
    return true;
  }
@ -1475,7 +1487,8 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
  ui::d3d12::util::CreateRawBufferUAV(device, descriptor_cpu_start,
                                      edram_buffer_, kEDRAMBufferSize);
  command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
-  command_list->Dispatch(row_tiles, rows, 1);
+  // 1 group per 80x16 samples.
+  command_list->Dispatch(row_width_ss_div_80, rows, 1);
  command_processor_->PushUAVBarrier(edram_buffer_);

  return true;
@ -1871,7 +1884,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
 bool RenderTargetCache::GetEDRAMLayout(
    uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp,
    uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
-    uint32_t& row_tiles_out, uint32_t& rows_out) {
+    uint32_t& row_width_ss_div_80_out, uint32_t& rows_out) {
  if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 ||
      rect_in_out.top >= rect_in_out.bottom) {
    return false;
@ -1921,8 +1934,7 @@ bool RenderTargetCache::GetEDRAMLayout(
  base_in_out = base;
  rect_in_out = rect;
  pitch_tiles_out = pitch_tiles;
-  row_tiles_out = (((rect.right << samples_x_log2) + 79) / 80)
-                  << sample_size_log2;
+  row_width_ss_div_80_out = ((rect.right << samples_x_log2) + 79) / 80;
  rows_out = rows;
  return true;
 }
@ -2044,14 +2056,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
    const RenderTarget* render_target = binding.render_target;
    bool is_64bpp = false;

-    // Get the number of X thread groups.
-    uint32_t rt_pitch_tiles = surface_pitch_tiles;
-    if (!render_target->key.is_depth &&
-        IsColorFormat64bpp(
-            ColorRenderTargetFormat(render_target->key.format))) {
-      rt_pitch_tiles *= 2;
-    }
-
    // Transition the copy buffer to copy destination.
    command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
                                              D3D12_RESOURCE_STATE_COPY_DEST);
@ -2084,6 +2088,12 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
      root_constants.rt_stencil_pitch =
          location_dest.PlacedFootprint.Footprint.RowPitch;
    }
+    uint32_t rt_pitch_tiles = surface_pitch_tiles;
+    if (!render_target->key.is_depth &&
+        IsColorFormat64bpp(
+            ColorRenderTargetFormat(render_target->key.format))) {
+      rt_pitch_tiles *= 2;
+    }
    root_constants.base_pitch_tiles =
        binding.edram_base | (rt_pitch_tiles << 11);

@ -2101,7 +2111,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
                                               render_target->key.format);
    command_processor_->SetComputePipeline(
        edram_store_pipelines_[size_t(mode)]);
-    command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1);
+    // 1 group per 80x16 samples.
+    command_list->Dispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1);

    // Commit the UAV write.
    command_processor_->PushUAVBarrier(edram_buffer_);
@ -2178,7 +2189,7 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
    }
    const RenderTarget* render_target = render_targets[i];

-    // Get the number of X thread groups.
+    // Get the number of EDRAM tiles per row.
    uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80;
    if (!render_target->key.is_depth &&
        IsColorFormat64bpp(
@ -2218,7 +2229,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
    EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
                                               render_target->key.format);
    command_processor_->SetComputePipeline(edram_load_pipelines_[size_t(mode)]);
-    command_list->Dispatch(edram_pitch_tiles, edram_rows, 1);
+    // 1 group per 80x16 samples.
+    command_list->Dispatch(render_target->key.width_ss_div_80, edram_rows, 1);

    // Commit the UAV write and transition the copy buffer to copy source now.
    command_processor_->PushUAVBarrier(copy_buffer);
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@ -381,7 +381,8 @@ class RenderTargetCache {
  static bool GetEDRAMLayout(uint32_t pitch_pixels, MsaaSamples msaa_samples,
                             bool is_64bpp, uint32_t& base_in_out,
                             D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
-                             uint32_t& row_tiles_out, uint32_t& rows_out);
+                             uint32_t& row_width_ss_div_80_out,
+                             uint32_t& rows_out);

  static EDRAMLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format);

@ -480,6 +481,7 @@ class RenderTargetCache {
  ID3D12PipelineState*
      edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {};
  ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr;
+  ID3D12PipelineState* edram_tile_sample_64bpp_pipeline_ = nullptr;
  ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr;
  ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr;

--- a/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl
@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
               any(sample_index >= clear_rect.zw)) {
    return;
  }
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
  xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_color32);
  if (sample_index.x + 1u < clear_rect.z) {
    xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_color32);
--- a/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl
@ -15,11 +15,11 @@ void main(uint3 xe_group_id : SV_GroupID,
               any(sample_index >= clear_rect.zw)) {
    return;
  }
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
  bool second_sample_inside = sample_index.x + 1u < clear_rect.z;
  // 24-bit depth.
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
  xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_depth24);
  [branch] if (second_sample_inside) {
    xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_depth24);
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
@ -4,11 +4,11 @@
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint4 pixels = xe_edram_load_store_source.Load4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint4 samples = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels);
+  xe_edram_load_store_dest.Store4(rt_offset, samples);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
@ -1,19 +1,14 @@
 #include "edram_load_store.hlsli"

-[numthreads(40, 8, 1)]
+[numthreads(40, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
-  // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
-  // from 1 render target row rather than 1. Threads with X 0-19 are for the
-  // first row, with 20-39 are for the second.
-  uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
-  [flatten] if (xe_group_thread_id.x >= 20u) {
-    tile_dword_index += uint2(uint(-80), 1u);
-  }
-  uint4 pixels = xe_edram_load_store_source.Load4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
+  uint4 samples = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index));
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels);
+  xe_edram_load_store_dest.Store4(rt_offset, samples);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
@ -5,16 +5,16 @@
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
-  uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
-  uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x);
-  uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y);
-  uint4 pixels_f16u32_packed =
-      uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
-      (uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
+  uint2 samples_7e3_packed = xe_edram_load_store_source.Load2(
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
+  uint4 sample_0_f16u32 = XeFloat7e3To16(samples_7e3_packed.x);
+  uint4 sample_1_f16u32 = XeFloat7e3To16(samples_7e3_packed.y);
+  uint4 samples_f16u32_packed =
+      uint4(sample_0_f16u32.xz, sample_1_f16u32.xz) |
+      (uint4(sample_0_f16u32.yw, sample_1_f16u32.yw) << 16u);
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
+  xe_edram_load_store_dest.Store4(rt_offset, samples_f16u32_packed);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
@ -5,9 +5,9 @@
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
  uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
  uint4 depth24 = depth24_stencil >> 8u;
  uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
--- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
@ -4,16 +4,16 @@
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint4 pixels = xe_edram_load_store_source.Load4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint4 samples = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
  // Depth.
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u);
+  xe_edram_load_store_dest.Store4(rt_offset, samples >> 8u);
  // Stencil.
-  uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u);
+  uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u);
  stencil.xy |= stencil.zw;
  stencil.x |= stencil.y;
  rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
--- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
@ -25,10 +25,10 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
 // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
 // 18:20 - destination endianness.
 // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
-//   For 32 bits per pixel:
+//   For 32 bits per sample:
 //     21:25 - red/blue bit depth.
 //     26:30 - blue offset.
-//   For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
+//   For 64 bits per sample, it's 1 if need to swap 0:15 and 32:47.
 #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)

 // For clearing.
@ -45,10 +45,20 @@ ByteAddressBuffer xe_edram_load_store_source : register(t0);
 #endif
 RWByteAddressBuffer xe_edram_load_store_dest : register(u0);

-uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
+uint XeEDRAMOffset32bpp(uint2 tile_index, uint2 tile_sample_index) {
  return ((xe_edram_base_pitch_tiles & 2047u) +
          tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) *
-         5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u;
+         5120u + tile_sample_index.y * 320u + tile_sample_index.x * 4u;
+}
+
+// Byte offset of a 64bpp sample in the EDRAM buffer. Addresses pairs of
+// consecutive 5120-byte tiles: the first holds the top 80x8 samples of the pair,
+// the second the bottom 80x8, so rows of the pair are 640 bytes apart.
+uint XeEDRAMOffset64bpp(uint2 tile_pair_index, uint2 tile_pair_sample_index) {
+  return ((xe_edram_base_pitch_tiles & 2047u) +
+          tile_pair_index.y * (xe_edram_base_pitch_tiles >> 11u) +
+          (tile_pair_index.x << 1u)) * 5120u +
+         tile_pair_sample_index.y * 640u + tile_pair_sample_index.x * 8u;
 }

 #endif  // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
@ -6,9 +6,9 @@ void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
+  uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
  xe_edram_load_store_dest.Store4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
@ -1,19 +1,14 @@
 #include "edram_load_store.hlsli"

-[numthreads(40, 8, 1)]
+[numthreads(40, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
-  // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
-  // from 1 render target row rather than 1. Threads with X 0-19 are for the
-  // first row, with 20-39 are for the second.
-  uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
-  [flatten] if (xe_group_thread_id.x >= 20u) {
-    tile_dword_index += uint2(uint(-80), 1u);
-  }
+  uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
  xe_edram_load_store_dest.Store4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
+      XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index), samples);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
@ -7,13 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
-  uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
-  uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
-  uint2 pixels_7e3_packed =
-      uint2(XeFloat16To7e3(pixel_0_f16u32), XeFloat16To7e3(pixel_1_f16u32));
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
+  uint4 samples_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
+  uint4 sample_0_f16u32 = samples_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
+  uint4 sample_1_f16u32 = samples_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
+  uint2 samples_7e3_packed =
+      uint2(XeFloat16To7e3(sample_0_f16u32), XeFloat16To7e3(sample_1_f16u32));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
  xe_edram_load_store_dest.Store2(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels_7e3_packed);
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index),
+      samples_7e3_packed);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
              xe_edram_rt_stencil_offset;
  depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
                      uint4(0u, 8u, 16u, 24u)) & 0xFFu;
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
  // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date.
  xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
  // Store 32-bit depth so precision isn't lost when doing multipass rendering.
--- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
@ -7,15 +7,15 @@ void main(uint3 xe_group_id : SV_GroupID,
  // Depth.
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels =
+  uint4 samples =
      (xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u;
  // Stencil.
  rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
              xe_edram_rt_stencil_offset;
-  pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
+  samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
             uint4(0u, 8u, 16u, 24u)) & 0xFFu;
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
  xe_edram_load_store_dest.Store4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
@ -23,7 +23,7 @@ void main(uint3 xe_group_id : SV_GroupID,
      (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
  uint2 edram_tile_quarter =
      uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
-  uint edram_offset = XeEDRAMOffset(
+  uint edram_offset = XeEDRAMOffset32bpp(
      (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
      (xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
      (sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
--- a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl
@ -0,0 +1,67 @@
+#include "byte_swap.hlsli"
+#include "edram_load_store.hlsli"
+#include "texture_address.hlsli"
+
+// Copies 64bpp EDRAM samples to a tiled texture, 4 texels in a row per thread.
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Check if not outside of the destination texture completely.
+  uint4 copy_rect;
+  copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
+  copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
+  uint2 texel_index = xe_thread_id.xy * uint2(4u, 1u);
+  [branch] if (any(texel_index < copy_rect.xy) ||
+               any(texel_index >= copy_rect.zw)) {
+    return;
+  }
+
+  // Get the samples from the EDRAM buffer.
+  // XY - log2(pixel size), ZW - selected sample offset.
+  uint4 sample_info =
+      (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
+  uint2 edram_tile_quarter =
+      uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
+  // Shift first, then add the sample offset (+ binds tighter than <<).
+  uint edram_offset = XeEDRAMOffset64bpp(
+      (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
+      ((xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
+       (sample_info.xy + uint2(2u, 0u))) + sample_info.zw);
+  // Loaded with the first 2 pixels at 1x and 2x, or the first 1 pixel at 4x.
+  uint4 pixels_01 = xe_edram_load_store_source.Load4(edram_offset);
+  // Loaded with the second 2 pixels at 1x and 2x, or the second 1 pixel at 4x.
+  uint4 pixels_23 = xe_edram_load_store_source.Load4(edram_offset + 16u);
+  [branch] if (sample_info.x != 0u) {
+    // Rather than 4 pixels, at 4x, we only have 2 - in xy of each variable
+    // rather than in xyzw of pixels_01. Combine and load 2 more.
+    pixels_01.zw = pixels_23.xy;
+    pixels_23.xy = xe_edram_load_store_source.Load2(edram_offset + 32u);
+    pixels_23.zw = xe_edram_load_store_source.Load2(edram_offset + 48u);
+  }
+
+  if ((xe_edram_tile_sample_dest_info >> 21u) != 0u) {
+    // Swap red/blue - all 64bpp formats where it's possible are 16:16:16:16.
+    pixels_01 = (pixels_01 & 0xFFFF0000u) | (pixels_01.yxwz & 0xFFFFu);
+    pixels_23 = (pixels_23 & 0xFFFF0000u) | (pixels_23.yxwz & 0xFFFFu);
+  }
+
+  // Tile the pixels to the shared memory.
+  pixels_01 = XeByteSwap(pixels_01, xe_edram_tile_sample_dest_info >> 18u);
+  pixels_23 = XeByteSwap(pixels_23, xe_edram_tile_sample_dest_info >> 18u);
+  uint4 texel_addresses =
+      xe_edram_tile_sample_dest_base +
+      XeTextureTiledOffset2D(texel_index - copy_rect.xy,
+                             xe_edram_tile_sample_dest_info & 16383u, 3u);
+  xe_edram_load_store_dest.Store2(texel_addresses.x, pixels_01.xy);
+  bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
+  [branch] if (texels_in_rect.x) {
+    xe_edram_load_store_dest.Store2(texel_addresses.y, pixels_01.zw);
+    [branch] if (texels_in_rect.y) {
+      xe_edram_load_store_dest.Store2(texel_addresses.z, pixels_23.xy);
+      [branch] if (texels_in_rect.z) {
+        xe_edram_load_store_dest.Store2(texel_addresses.w, pixels_23.zw);
+      }
+    }
+  }
+}