From c9ffe98d21ddb440959a06d639aabf756df0a8e6 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Sun, 16 Sep 2018 15:11:11 +0300
Subject: [PATCH] [D3D12] 64bpp raw resolve and EDRAM refactoring

---
 src/xenia/gpu/d3d12/render_target_cache.cc    | 70 +++++++++++--------
 src/xenia/gpu/d3d12/render_target_cache.h     |  4 +-
 .../d3d12/shaders/edram_clear_32bpp.cs.hlsl   |  6 +-
 .../shaders/edram_clear_depth_float.cs.hlsl   |  6 +-
 .../shaders/edram_load_color_32bpp.cs.hlsl    | 10 +--
 .../shaders/edram_load_color_64bpp.cs.hlsl    | 17 ++---
 .../shaders/edram_load_color_7e3.cs.hlsl      | 20 +++---
 .../shaders/edram_load_depth_float.cs.hlsl    |  6 +-
 .../shaders/edram_load_depth_unorm.cs.hlsl    | 12 ++--
 .../gpu/d3d12/shaders/edram_load_store.hlsli  | 18 +++--
 .../shaders/edram_store_color_32bpp.cs.hlsl   |  8 +--
 .../shaders/edram_store_color_64bpp.cs.hlsl   | 15 ++--
 .../shaders/edram_store_color_7e3.cs.hlsl     | 17 ++---
 .../shaders/edram_store_depth_float.cs.hlsl   |  6 +-
 .../shaders/edram_store_depth_unorm.cs.hlsl   | 10 +--
 .../shaders/edram_tile_sample_32bpp.cs.hlsl   |  2 +-
 .../shaders/edram_tile_sample_64bpp.cs.hlsl   | 67 ++++++++++++++++++
 17 files changed, 188 insertions(+), 106 deletions(-)
 create mode 100644 src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl

diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc
index 2acc6d52c..8d36e1854 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@@ -41,6 +41,7 @@ namespace d3d12 {
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_32bpp_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_64bpp_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/resolve_ps.h"
 #include "xenia/gpu/d3d12/shaders/bin/resolve_vs.h"
 
@@ -173,6 +174,16 @@ bool RenderTargetCache::Initialize() {
     return false;
   }
   edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp");
+  // Tile single sample into a texture - 64 bits per pixel.
+  edram_tile_sample_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
+      device, edram_tile_sample_64bpp_cs, sizeof(edram_tile_sample_64bpp_cs),
+      edram_load_store_root_signature_);
+  if (edram_tile_sample_64bpp_pipeline_ == nullptr) {
+    XELOGE("Failed to create the 64bpp EDRAM raw resolve pipeline");
+    Shutdown();
+    return false;
+  }
+  edram_tile_sample_64bpp_pipeline_->SetName(L"EDRAM Raw Resolve 64bpp");
   // Clear 32-bit color or unorm depth.
   edram_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
       device, edram_clear_32bpp_cs, sizeof(edram_clear_32bpp_cs),
@@ -258,6 +269,7 @@ void RenderTargetCache::Shutdown() {
   }
   resolve_pipelines_.clear();
   ui::d3d12::util::ReleaseAndNull(resolve_root_signature_);
+  ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_);
   ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_);
   ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_);
   ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_);
@@ -977,9 +989,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
 
   // Validate and clamp the source region, skip parts that don't need to be
   // copied and calculate the number of threads needed for copying/loading.
-  uint32_t surface_pitch_tiles, row_tiles, rows;
+  uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
   if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base,
-                      copy_rect, surface_pitch_tiles, row_tiles, rows)) {
+                      copy_rect, surface_pitch_tiles, row_width_ss_div_80,
+                      rows)) {
     // Nothing to copy.
     return true;
   }
@@ -1008,10 +1021,6 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
     // Raw copy
     // *************************************************************************
     XELOGGPU("Resolve: Copying using a compute shader");
-    if (src_64bpp) {
-      // TODO(Triang3l): 64bpp sample copy shader.
-      return false;
-    }
 
     // Make sure we have the memory to write to.
     if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
@@ -1079,10 +1088,11 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
     command_list->SetComputeRoot32BitConstants(
         0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
     command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
-    // TODO(Triang3l): 64bpp pipeline.
-    command_processor_->SetComputePipeline(edram_tile_sample_32bpp_pipeline_);
-    // 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
-    uint32_t group_count_x = row_tiles, group_count_y = rows;
+    command_processor_->SetComputePipeline(
+        src_64bpp ? edram_tile_sample_64bpp_pipeline_
+                  : edram_tile_sample_32bpp_pipeline_);
+    // 1 group per destination 80x16 region.
+    uint32_t group_count_x = row_width_ss_div_80, group_count_y = rows;
     if (msaa_samples >= MsaaSamples::k2X) {
       group_count_y = (group_count_y + 1) >> 1;
       if (msaa_samples >= MsaaSamples::k4X) {
@@ -1121,7 +1131,7 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
       return false;
     }
     RenderTargetKey render_target_key;
-    render_target_key.width_ss_div_80 = row_tiles >> (src_64bpp ? 1 : 0);
+    render_target_key.width_ss_div_80 = row_width_ss_div_80;
     render_target_key.height_ss_div_16 = rows;
     render_target_key.is_depth = false;
     render_target_key.format = src_format;
@@ -1190,7 +1200,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
 
     command_processor_->SetComputePipeline(
         edram_load_pipelines_[size_t(GetLoadStoreMode(false, src_format))]);
-    command_list->Dispatch(row_tiles, rows, 1);
+    // 1 group per 80x16 samples.
+    command_list->Dispatch(row_width_ss_div_80, rows, 1);
     command_processor_->PushUAVBarrier(copy_buffer);
 
     // Go to the next descriptor set.
@@ -1405,9 +1416,10 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
   bool is_64bpp =
       !is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format));
   D3D12_RECT clear_rect = rect;
-  uint32_t surface_pitch_tiles, row_tiles, rows;
+  uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
   if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base,
-                      clear_rect, surface_pitch_tiles, row_tiles, rows)) {
+                      clear_rect, surface_pitch_tiles, row_width_ss_div_80,
+                      rows)) {
     // Nothing to clear.
     return true;
   }
@@ -1475,7 +1487,8 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
   ui::d3d12::util::CreateRawBufferUAV(device, descriptor_cpu_start,
                                       edram_buffer_, kEDRAMBufferSize);
   command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
-  command_list->Dispatch(row_tiles, rows, 1);
+  // 1 group per 80x16 samples.
+  command_list->Dispatch(row_width_ss_div_80, rows, 1);
   command_processor_->PushUAVBarrier(edram_buffer_);
 
   return true;
@@ -1871,7 +1884,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
 bool RenderTargetCache::GetEDRAMLayout(
     uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp,
     uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
-    uint32_t& row_tiles_out, uint32_t& rows_out) {
+    uint32_t& row_width_ss_div_80_out, uint32_t& rows_out) {
   if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 ||
       rect_in_out.top >= rect_in_out.bottom) {
     return false;
@@ -1921,8 +1934,7 @@ bool RenderTargetCache::GetEDRAMLayout(
   base_in_out = base;
   rect_in_out = rect;
   pitch_tiles_out = pitch_tiles;
-  row_tiles_out = (((rect.right << samples_x_log2) + 79) / 80)
-                  << sample_size_log2;
+  row_width_ss_div_80_out = ((rect.right << samples_x_log2) + 79) / 80;
   rows_out = rows;
   return true;
 }
@@ -2044,14 +2056,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
     const RenderTarget* render_target = binding.render_target;
     bool is_64bpp = false;
 
-    // Get the number of X thread groups.
-    uint32_t rt_pitch_tiles = surface_pitch_tiles;
-    if (!render_target->key.is_depth &&
-        IsColorFormat64bpp(
-            ColorRenderTargetFormat(render_target->key.format))) {
-      rt_pitch_tiles *= 2;
-    }
-
     // Transition the copy buffer to copy destination.
     command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
                                               D3D12_RESOURCE_STATE_COPY_DEST);
@@ -2084,6 +2088,12 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
       root_constants.rt_stencil_pitch =
           location_dest.PlacedFootprint.Footprint.RowPitch;
     }
+    uint32_t rt_pitch_tiles = surface_pitch_tiles;
+    if (!render_target->key.is_depth &&
+        IsColorFormat64bpp(
+            ColorRenderTargetFormat(render_target->key.format))) {
+      rt_pitch_tiles *= 2;
+    }
     root_constants.base_pitch_tiles =
         binding.edram_base | (rt_pitch_tiles << 11);
 
@@ -2101,7 +2111,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
                                                render_target->key.format);
     command_processor_->SetComputePipeline(
         edram_store_pipelines_[size_t(mode)]);
-    command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1);
+    // 1 group per 80x16 samples.
+    command_list->Dispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1);
 
     // Commit the UAV write.
     command_processor_->PushUAVBarrier(edram_buffer_);
@@ -2178,7 +2189,7 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
     }
     const RenderTarget* render_target = render_targets[i];
 
-    // Get the number of X thread groups.
+    // Get the number of EDRAM tiles per row.
     uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80;
     if (!render_target->key.is_depth &&
         IsColorFormat64bpp(
@@ -2218,7 +2229,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
     EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
                                                render_target->key.format);
     command_processor_->SetComputePipeline(edram_load_pipelines_[size_t(mode)]);
-    command_list->Dispatch(edram_pitch_tiles, edram_rows, 1);
+    // 1 group per 80x16 samples.
+    command_list->Dispatch(render_target->key.width_ss_div_80, edram_rows, 1);
 
     // Commit the UAV write and transition the copy buffer to copy source now.
     command_processor_->PushUAVBarrier(copy_buffer);
diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h
index df4c12120..07570fd28 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@@ -381,7 +381,8 @@ class RenderTargetCache {
   static bool GetEDRAMLayout(uint32_t pitch_pixels, MsaaSamples msaa_samples,
                              bool is_64bpp, uint32_t& base_in_out,
                              D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
-                             uint32_t& row_tiles_out, uint32_t& rows_out);
+                             uint32_t& row_width_ss_div_80_out,
+                             uint32_t& rows_out);
 
   static EDRAMLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format);
 
@@ -480,6 +481,7 @@ class RenderTargetCache {
   ID3D12PipelineState*
       edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {};
   ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr;
+  ID3D12PipelineState* edram_tile_sample_64bpp_pipeline_ = nullptr;
   ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr;
   ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr;
 
diff --git a/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl
index 1609e1194..f123a7ef9 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl
@@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
                any(sample_index >= clear_rect.zw)) {
     return;
   }
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
   xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_color32);
   if (sample_index.x + 1u < clear_rect.z) {
     xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_color32);
diff --git a/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl
index 1b5ab59cf..3dc70ed73 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl
@@ -15,11 +15,11 @@ void main(uint3 xe_group_id : SV_GroupID,
                any(sample_index >= clear_rect.zw)) {
     return;
   }
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
   bool second_sample_inside = sample_index.x + 1u < clear_rect.z;
   // 24-bit depth.
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
   xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_depth24);
   [branch] if (second_sample_inside) {
     xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_depth24);
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
index 0bcdfb8e8..ead98d0cf 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
@@ -4,11 +4,11 @@
 void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_group_thread_id : SV_GroupThreadID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint4 pixels = xe_edram_load_store_source.Load4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint4 samples = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels);
+  xe_edram_load_store_dest.Store4(rt_offset, samples);
 }
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
index a65aa4bf2..396462c85 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
@@ -1,19 +1,14 @@
 #include "edram_load_store.hlsli"
 
-[numthreads(40, 8, 1)]
+[numthreads(40, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_group_thread_id : SV_GroupThreadID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
-  // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
-  // from 1 render target row rather than 1. Threads with X 0-19 are for the
-  // first row, with 20-39 are for the second.
-  uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
-  [flatten] if (xe_group_thread_id.x >= 20u) {
-    tile_dword_index += uint2(uint(-80), 1u);
-  }
-  uint4 pixels = xe_edram_load_store_source.Load4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
+  uint4 samples = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index));
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels);
+  xe_edram_load_store_dest.Store4(rt_offset, samples);
 }
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
index 53c18687e..43b85be6e 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
@@ -5,16 +5,16 @@
 void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_group_thread_id : SV_GroupThreadID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
-  uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
-  uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x);
-  uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y);
-  uint4 pixels_f16u32_packed =
-      uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
-      (uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
+  uint2 samples_7e3_packed = xe_edram_load_store_source.Load2(
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
+  uint4 sample_0_f16u32 = XeFloat7e3To16(samples_7e3_packed.x);
+  uint4 sample_1_f16u32 = XeFloat7e3To16(samples_7e3_packed.y);
+  uint4 samples_f16u32_packed =
+      uint4(sample_0_f16u32.xz, sample_1_f16u32.xz) |
+      (uint4(sample_0_f16u32.yw, sample_1_f16u32.yw) << 16u);
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
+  xe_edram_load_store_dest.Store4(rt_offset, samples_f16u32_packed);
 }
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
index 06eeb0080..b4c00fdd2 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
@@ -5,9 +5,9 @@
 void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_group_thread_id : SV_GroupThreadID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
   uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
   uint4 depth24 = depth24_stencil >> 8u;
   uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
index 0fdbadd2b..d8bcc069f 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
@@ -4,16 +4,16 @@
 void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_group_thread_id : SV_GroupThreadID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint4 pixels = xe_edram_load_store_source.Load4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint4 samples = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
   // Depth.
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u);
+  xe_edram_load_store_dest.Store4(rt_offset, samples >> 8u);
   // Stencil.
-  uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u);
+  uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u);
   stencil.xy |= stencil.zw;
   stencil.x |= stencil.y;
   rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
index e572c2f03..dd17089a4 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
@@ -25,10 +25,10 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
 // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
 // 18:20 - destination endianness.
 // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
-//   For 32 bits per pixel:
+//   For 32 bits per sample:
 //     21:25 - red/blue bit depth.
 //     26:30 - blue offset.
-//   For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
+//   For 64 bits per sample, it's 1 if need to swap 0:15 and 32:47.
 #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
 
 // For clearing.
@@ -45,10 +45,20 @@ ByteAddressBuffer xe_edram_load_store_source : register(t0);
 #endif
 RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
 
-uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
+uint XeEDRAMOffset32bpp(uint2 tile_index, uint2 tile_sample_index) {
   return ((xe_edram_base_pitch_tiles & 2047u) +
           tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) *
-         5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u;
+         5120u + tile_sample_index.y * 320u + tile_sample_index.x * 4u;
+}
+
+// Works on pairs of consecutive tiles instead of individual tiles: the first
+// tile holds the top 80x8 samples, the second one the bottom 80x8 samples.
+uint XeEDRAMOffset64bpp(uint2 tile_pair_index, uint2 tile_pair_sample_index) {
+  uint first_tile = (xe_edram_base_pitch_tiles & 2047u) +
+                    tile_pair_index.y * (xe_edram_base_pitch_tiles >> 11u) +
+                    tile_pair_index.x * 2u;
+  return first_tile * 5120u + tile_pair_sample_index.y * 640u +
+         tile_pair_sample_index.x * 8u;
 }
 
 #endif  // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
index db8038ae6..31c9badbc 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
@@ -6,9 +6,9 @@ void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
+  uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
   xe_edram_load_store_dest.Store4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
 }
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
index 7a91fe1b3..b6c13b3e3 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
@@ -1,19 +1,14 @@
 #include "edram_load_store.hlsli"
 
-[numthreads(40, 8, 1)]
+[numthreads(40, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_group_thread_id : SV_GroupThreadID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
-  // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
-  // from 1 render target row rather than 1. Threads with X 0-19 are for the
-  // first row, with 20-39 are for the second.
-  uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
-  [flatten] if (xe_group_thread_id.x >= 20u) {
-    tile_dword_index += uint2(uint(-80), 1u);
-  }
+  uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
   xe_edram_load_store_dest.Store4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
+      XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index), samples);
 }
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
index 2beef5b4a..71d4e5f36 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
@@ -7,13 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID,
           uint3 xe_thread_id : SV_DispatchThreadID) {
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
-  uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
-  uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
-  uint2 pixels_7e3_packed =
-      uint2(XeFloat16To7e3(pixel_0_f16u32), XeFloat16To7e3(pixel_1_f16u32));
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 2u;
+  uint4 samples_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
+  uint4 sample_0_f16u32 = samples_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
+  uint4 sample_1_f16u32 = samples_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
+  uint2 samples_7e3_packed =
+      uint2(XeFloat16To7e3(sample_0_f16u32), XeFloat16To7e3(sample_1_f16u32));
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 2u;
   xe_edram_load_store_dest.Store2(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels_7e3_packed);
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index),
+      samples_7e3_packed);
 }
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
index 4134240a4..2b7fd6ed4 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
@@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
               xe_edram_rt_stencil_offset;
   depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
                       uint4(0u, 8u, 16u, 24u)) & 0xFFu;
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
-  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
+  uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
   // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date.
   xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
   // Store 32-bit depth so precision isn't lost when doing multipass rendering.
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
index 010cef44b..9e07aa497 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
@@ -7,15 +7,15 @@ void main(uint3 xe_group_id : SV_GroupID,
   // Depth.
   uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                    xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
-  uint4 pixels =
+  uint4 samples =
       (xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u;
   // Stencil.
   rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
               xe_edram_rt_stencil_offset;
-  pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
+  samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
              uint4(0u, 8u, 16u, 24u)) & 0xFFu;
-  uint2 tile_dword_index = xe_group_thread_id.xy;
-  tile_dword_index.x *= 4u;
+  uint2 tile_sample_index = xe_group_thread_id.xy;
+  tile_sample_index.x *= 4u;
   xe_edram_load_store_dest.Store4(
-      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
+      XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
 }
diff --git a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
index e4c86f62a..b309ebaeb 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
@@ -23,7 +23,7 @@ void main(uint3 xe_group_id : SV_GroupID,
       (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
   uint2 edram_tile_quarter =
       uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
-  uint edram_offset = XeEDRAMOffset(
+  uint edram_offset = XeEDRAMOffset32bpp(
       (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
       (xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
       (sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
diff --git a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl
new file mode 100644
index 000000000..c2d00c61d
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl
@@ -0,0 +1,67 @@
+#include "byte_swap.hlsli"
+#include "edram_load_store.hlsli"
+#include "texture_address.hlsli"
+
+// Raw resolve: each thread tiles one sample of up to 4 adjacent 64bpp pixels.
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Check if not outside of the destination texture completely.
+  uint4 copy_rect;
+  copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
+  copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
+  uint2 texel_index = xe_thread_id.xy * uint2(4u, 1u);
+  [branch] if (any(texel_index < copy_rect.xy) ||
+               any(texel_index >= copy_rect.zw)) {
+    return;
+  }
+
+  // Get the samples from the EDRAM buffer.
+  // XY - log2(pixel size), ZW - selected sample offset.
+  uint4 sample_info =
+      (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
+  uint2 edram_tile_quarter =
+      uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
+  // Parenthesized since + binds tighter than <<: scale first, then offset.
+  uint edram_offset = XeEDRAMOffset64bpp(
+      (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
+      ((xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
+       (sample_info.xy + uint2(2u, 0u))) + sample_info.zw);
+  // Loaded with the first 2 pixels at 1x and 2x, or the first 1 pixel at 4x.
+  uint4 pixels_01 = xe_edram_load_store_source.Load4(edram_offset);
+  // Loaded with the second 2 pixels at 1x and 2x, or the second 1 pixel at 4x.
+  uint4 pixels_23 = xe_edram_load_store_source.Load4(edram_offset + 16u);
+  [branch] if (sample_info.x != 0u) {
+    // Rather than 4 pixels, at 4x, we only have 2 - in xy of each variable
+    // rather than in xyzw of pixels_01. Combine and load 2 more.
+    pixels_01.zw = pixels_23.xy;
+    pixels_23.xy = xe_edram_load_store_source.Load2(edram_offset + 32u);
+    pixels_23.zw = xe_edram_load_store_source.Load2(edram_offset + 48u);
+  }
+
+  if ((xe_edram_tile_sample_dest_info >> 21u) != 0u) {
+    // Swap red and blue; 64bpp formats where this is possible are 16:16:16:16.
+    pixels_01 = (pixels_01 & 0xFFFF0000u) | (pixels_01.yxwz & 0xFFFFu);
+    pixels_23 = (pixels_23 & 0xFFFF0000u) | (pixels_23.yxwz & 0xFFFFu);
+  }
+
+  // Tile the pixels to the shared memory.
+  pixels_01 = XeByteSwap(pixels_01, xe_edram_tile_sample_dest_info >> 18u);
+  pixels_23 = XeByteSwap(pixels_23, xe_edram_tile_sample_dest_info >> 18u);
+  uint4 texel_addresses =
+      xe_edram_tile_sample_dest_base +
+      XeTextureTiledOffset2D(texel_index - copy_rect.xy,
+                             xe_edram_tile_sample_dest_info & 16383u, 3u);
+  xe_edram_load_store_dest.Store2(texel_addresses.x, pixels_01.xy);
+  bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
+  [branch] if (texels_in_rect.x) {
+    xe_edram_load_store_dest.Store2(texel_addresses.y, pixels_01.zw);
+    [branch] if (texels_in_rect.y) {
+      xe_edram_load_store_dest.Store2(texel_addresses.z, pixels_23.xy);
+      [branch] if (texels_in_rect.z) {
+        xe_edram_load_store_dest.Store2(texel_addresses.w, pixels_23.zw);
+      }
+    }
+  }
+}