[D3D12] EDRAM storing and random cleanup

2018-08-11 20:33:33 +03:00 · 2018-08-11 20:33:33 +03:00 · 9b303c64ba
parent a4b98cda31
commit 9b303c64ba
17 changed files with 760 additions and 11 deletions
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@ -377,7 +377,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestScratchGPUBuffer(
      barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
      barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
      barrier.Transition.pResource = scratch_buffer_;
-      barrier.Transition.Subresource = 0;
+      barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
      barrier.Transition.StateBefore = scratch_buffer_state_;
      barrier.Transition.StateAfter = state;
      GetCurrentCommandList()->ResourceBarrier(1, &barrier);
@ -489,6 +489,10 @@ bool D3D12CommandProcessor::SetupContext() {
  render_target_cache_ =
      std::make_unique<RenderTargetCache>(this, register_file_);
  if (!render_target_cache_->Initialize()) {
    XELOGE("Failed to initialize the render target cache");
    return false;
  }
  return true;
 }
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@ -21,13 +21,176 @@ namespace xe {
 namespace gpu {
 namespace d3d12 {
 // Generated with `xb buildhlsl`.
 #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_32bpp_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_64bpp_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_7e3_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_float_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_unorm_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_color_32bpp_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_color_64bpp_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_color_7e3_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h"
 #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h"
 // Compute shader bytecode, bytecode size and debug name for every EDRAM
 // load/store pipeline. The table is indexed by EDRAMLoadStorePipelineIndex, so
 // the entries must stay in the same order as that enum (load and store
 // alternating for each format).
 const RenderTargetCache::EDRAMLoadStorePipelineInfo
    RenderTargetCache::edram_load_store_pipeline_info_[size_t(
        RenderTargetCache::EDRAMLoadStorePipelineIndex::kCount)] = {
        {edram_load_color_32bpp_cs, sizeof(edram_load_color_32bpp_cs),
         L"EDRAM Load 32bpp Color"},
        {edram_store_color_32bpp_cs, sizeof(edram_store_color_32bpp_cs),
         L"EDRAM Store 32bpp Color"},
        {edram_load_color_64bpp_cs, sizeof(edram_load_color_64bpp_cs),
         L"EDRAM Load 64bpp Color"},
        {edram_store_color_64bpp_cs, sizeof(edram_store_color_64bpp_cs),
         L"EDRAM Store 64bpp Color"},
        {edram_load_color_7e3_cs, sizeof(edram_load_color_7e3_cs),
         L"EDRAM Load 7e3 Color"},
        {edram_store_color_7e3_cs, sizeof(edram_store_color_7e3_cs),
         L"EDRAM Store 7e3 Color"},
        {edram_load_depth_unorm_cs, sizeof(edram_load_depth_unorm_cs),
         L"EDRAM Load UNorm Depth"},
        {edram_store_depth_unorm_cs, sizeof(edram_store_depth_unorm_cs),
         L"EDRAM Store UNorm Depth"},
        {edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs),
         L"EDRAM Load Float Depth"},
        {edram_store_depth_float_cs, sizeof(edram_store_depth_float_cs),
         L"EDRAM Store Float Depth"},
 };
 // Only stores the command processor and register file pointers; the D3D12
 // objects are created separately by Initialize().
 RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor,
                                     RegisterFile* register_file)
    : command_processor_(command_processor), register_file_(register_file) {}
 // Releases everything through Shutdown().
 RenderTargetCache::~RenderTargetCache() { Shutdown(); }
-void RenderTargetCache::Shutdown() { ClearCache(); }
+bool RenderTargetCache::Initialize() {
  // Creates the EDRAM buffer, the root signature shared by all EDRAM
  // load/store compute shaders, and one compute pipeline per
  // EDRAMLoadStorePipelineIndex entry.
  // Returns false (after logging) if any of the objects can't be created;
  // once the EDRAM buffer exists, failures also call Shutdown() to release
  // what has been created so far.
  auto device =
      command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
  // Create the buffer for reinterpreting EDRAM contents.
  D3D12_RESOURCE_DESC edram_buffer_desc;
  edram_buffer_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
  edram_buffer_desc.Alignment = 0;
  // First 10 MB is guest pixel data, second 10 MB is 32-bit depth when using
  // D24FS8 so loads/stores don't corrupt multipass rendering.
  edram_buffer_desc.Width = 2 * 2048 * 5120;
  edram_buffer_desc.Height = 1;
  edram_buffer_desc.DepthOrArraySize = 1;
  edram_buffer_desc.MipLevels = 1;
  edram_buffer_desc.Format = DXGI_FORMAT_UNKNOWN;
  edram_buffer_desc.SampleDesc.Count = 1;
  edram_buffer_desc.SampleDesc.Quality = 0;
  edram_buffer_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
  edram_buffer_desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
  D3D12_HEAP_PROPERTIES edram_buffer_heap_properties = {};
  edram_buffer_heap_properties.Type = D3D12_HEAP_TYPE_DEFAULT;
  // The first operation will be a clear.
  edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
  if (FAILED(device->CreateCommittedResource(
          &edram_buffer_heap_properties, D3D12_HEAP_FLAG_NONE,
          &edram_buffer_desc, edram_buffer_state_, nullptr,
          IID_PPV_ARGS(&edram_buffer_)))) {
    XELOGE("Failed to create the EDRAM buffer");
    return false;
  }
  edram_buffer_cleared_ = false;
  // Create the root signature for EDRAM buffer load/store.
  D3D12_ROOT_PARAMETER root_parameters[2];
  // Parameter 0 is constants (changed for each render target binding).
  root_parameters[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
  root_parameters[0].Constants.ShaderRegister = 0;
  root_parameters[0].Constants.RegisterSpace = 0;
  root_parameters[0].Constants.Num32BitValues =
      sizeof(EDRAMLoadStoreRootConstants) / sizeof(uint32_t);
  root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
  // Parameter 1 is source and target: a table with the source SRV (t0) at
  // descriptor 0 followed by the destination UAV (u0) at descriptor 1.
  D3D12_DESCRIPTOR_RANGE root_load_store_ranges[2];
  root_load_store_ranges[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
  root_load_store_ranges[0].NumDescriptors = 1;
  root_load_store_ranges[0].BaseShaderRegister = 0;
  root_load_store_ranges[0].RegisterSpace = 0;
  root_load_store_ranges[0].OffsetInDescriptorsFromTableStart = 0;
  root_load_store_ranges[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
  root_load_store_ranges[1].NumDescriptors = 1;
  root_load_store_ranges[1].BaseShaderRegister = 0;
  root_load_store_ranges[1].RegisterSpace = 0;
  root_load_store_ranges[1].OffsetInDescriptorsFromTableStart = 1;
  root_parameters[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
  root_parameters[1].DescriptorTable.NumDescriptorRanges = 2;
  root_parameters[1].DescriptorTable.pDescriptorRanges = root_load_store_ranges;
  root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
  D3D12_ROOT_SIGNATURE_DESC root_signature_desc;
  root_signature_desc.NumParameters = UINT(xe::countof(root_parameters));
  root_signature_desc.pParameters = root_parameters;
  root_signature_desc.NumStaticSamplers = 0;
  root_signature_desc.pStaticSamplers = nullptr;
  root_signature_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
  ID3DBlob* root_signature_blob;
  ID3DBlob* root_signature_error_blob = nullptr;
  if (FAILED(D3D12SerializeRootSignature(
          &root_signature_desc, D3D_ROOT_SIGNATURE_VERSION_1,
          &root_signature_blob, &root_signature_error_blob))) {
    XELOGE("Failed to serialize the EDRAM buffer load/store root signature");
    if (root_signature_error_blob != nullptr) {
      XELOGE("%s", reinterpret_cast<const char*>(
                       root_signature_error_blob->GetBufferPointer()));
      root_signature_error_blob->Release();
    }
    Shutdown();
    return false;
  }
  // An error blob may be returned even on success, so release it here too.
  if (root_signature_error_blob != nullptr) {
    root_signature_error_blob->Release();
  }
  if (FAILED(device->CreateRootSignature(
          0, root_signature_blob->GetBufferPointer(),
          root_signature_blob->GetBufferSize(),
          IID_PPV_ARGS(&edram_load_store_root_signature_)))) {
    XELOGE("Failed to create the EDRAM buffer load/store root signature");
    root_signature_blob->Release();
    Shutdown();
    return false;
  }
  root_signature_blob->Release();
  // Create the load/store pipelines.
  D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc;
  pipeline_desc.pRootSignature = edram_load_store_root_signature_;
  pipeline_desc.NodeMask = 0;
  pipeline_desc.CachedPSO.pCachedBlob = nullptr;
  pipeline_desc.CachedPSO.CachedBlobSizeInBytes = 0;
  pipeline_desc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE;
  for (uint32_t i = 0; i < uint32_t(EDRAMLoadStorePipelineIndex::kCount); ++i) {
    const EDRAMLoadStorePipelineInfo& pipeline_info =
        edram_load_store_pipeline_info_[i];
    pipeline_desc.CS.pShaderBytecode = pipeline_info.shader;
    pipeline_desc.CS.BytecodeLength = pipeline_info.shader_size;
    if (FAILED(device->CreateComputePipelineState(
            &pipeline_desc, IID_PPV_ARGS(&edram_load_store_pipelines_[i])))) {
      XELOGE("Failed to create EDRAM load/store pipeline for mode %u", i);
      Shutdown();
      return false;
    }
    // Apply the debug name from the pipeline info table so the pipeline can be
    // identified in debugging tools. Naming is best-effort, so the result is
    // intentionally ignored.
    edram_load_store_pipelines_[i]->SetName(pipeline_info.name);
  }
  return true;
 }
 void RenderTargetCache::Shutdown() {
  ClearCache();
  if (edram_load_store_root_signature_ != nullptr) {
    edram_load_store_root_signature_->Release();
    edram_load_store_root_signature_ = nullptr;
  }
  if (edram_buffer_ != nullptr) {
    edram_buffer_->Release();
    edram_buffer_ = nullptr;
  }
 }
 void RenderTargetCache::ClearCache() {
  for (auto render_target_pair : render_targets_) {
@ -334,7 +497,7 @@ bool RenderTargetCache::UpdateRenderTargets() {
    uint32_t heap_usage[5] = {};
    if (full_update) {
      // Export the currently bound render targets before we ruin the bindings.
-      WriteRenderTargetsToEDRAM();
+      StoreRenderTargetsToEDRAM();
      ClearBindings();
      current_surface_pitch_ = surface_pitch;
@ -527,7 +690,7 @@ bool RenderTargetCache::UpdateRenderTargets() {
 }
 void RenderTargetCache::EndFrame() {
-  WriteRenderTargetsToEDRAM();
+  StoreRenderTargetsToEDRAM();
  ClearBindings();
 }
@ -709,6 +872,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
  }
  ++descriptor_heap->descriptors_used;
  // Get the layout for copying to the EDRAM buffer.
  RenderTarget* render_target = new RenderTarget;
  render_target->resource = resource;
  render_target->state = state;
@ -716,11 +880,245 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
  render_target->key = key;
  render_target->heap_page_first = heap_page_first;
  render_target->heap_page_count = heap_page_count;
  UINT64 copy_buffer_size;
  device->GetCopyableFootprints(&resource_desc, 0, key.is_depth ? 2 : 1, 0,
                                render_target->footprints, nullptr, nullptr,
                                &copy_buffer_size);
  render_target->copy_buffer_size = uint32_t(copy_buffer_size);
  render_targets_.insert(std::make_pair(key.value, render_target));
  return render_target;
 }
-void RenderTargetCache::WriteRenderTargetsToEDRAM() {}
+void RenderTargetCache::StoreRenderTargetsToEDRAM() {
  // Writes the currently bound render targets that have at least one dirty
  // row of tiles into the EDRAM buffer. Each render target is copied to a
  // scratch buffer with CopyTextureRegion, then a compute shader selected by
  // the render target format writes the scratch buffer contents to the EDRAM
  // buffer UAV. Does nothing if there's no open command list, nothing to
  // store, or the descriptors/scratch buffer can't be obtained.
  // NOTE(review): edram_dirty_length is not reset in this function - confirm
  // that the callers (which call ClearBindings afterwards) take care of it.
  auto command_list = command_processor_->GetCurrentCommandList();
  if (command_list == nullptr) {
    return;
  }
  uint32_t surface_pitch_ss =
      current_surface_pitch_ *
      (current_msaa_samples_ >= MsaaSamples::k4X ? 2 : 1);
  uint32_t surface_pitch_tiles = (surface_pitch_ss + 79) / 80;
  assert_true(surface_pitch_tiles != 0);
  // TODO(Triang3l): Clear the buffer if calling for the first time.
  uint32_t store_bindings[5];
  uint32_t store_binding_count = 0;
  // Up to 5 render target transitions plus 1 EDRAM buffer transition.
  D3D12_RESOURCE_BARRIER barriers[6];
  uint32_t barrier_count;
  // Extract only the render targets that need to be stored, transition them to
  // copy sources and calculate intermediate buffer size.
  uint32_t copy_buffer_size = 0;
  barrier_count = 0;
  for (uint32_t i = 0; i < 5; ++i) {
    const RenderTargetBinding& binding = current_bindings_[i];
    RenderTarget* render_target = binding.render_target;
    // TODO(Triang3l): Change edram_dirty_length to dirty row count.
    if (!binding.is_bound || render_target == nullptr ||
        binding.edram_dirty_length < surface_pitch_tiles) {
      continue;
    }
    store_bindings[store_binding_count] = i;
    copy_buffer_size =
        std::max(copy_buffer_size, render_target->copy_buffer_size);
    ++store_binding_count;
    if (render_target->state != D3D12_RESOURCE_STATE_COPY_SOURCE) {
      D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++];
      barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
      barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
      barrier.Transition.pResource = render_target->resource;
      barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
      barrier.Transition.StateBefore = render_target->state;
      barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
      render_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE;
    }
  }
  if (store_binding_count == 0) {
    return;
  }
  if (edram_buffer_state_ != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
    // Also transition the EDRAM buffer to UAV.
    D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++];
    barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
    barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
    barrier.Transition.pResource = edram_buffer_;
    barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
    barrier.Transition.StateBefore = edram_buffer_state_;
    barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
    edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
  }
  if (barrier_count != 0) {
    command_list->ResourceBarrier(barrier_count, barriers);
  }
  // Allocate descriptors for the buffers.
  D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
  D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
  if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start,
                                                 descriptor_gpu_start) == 0) {
    return;
  }
  // Get the buffer for copying.
  D3D12_RESOURCE_STATES copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
  ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer(
      copy_buffer_size, copy_buffer_state);
  if (copy_buffer == nullptr) {
    return;
  }
  // Prepare for writing.
  auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto device = provider->GetDevice();
  auto descriptor_size_view = provider->GetDescriptorSizeView();
  // Descriptor 0 is the raw SRV of the scratch buffer (sized in dwords).
  D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc;
  srv_desc.Format = DXGI_FORMAT_R32_TYPELESS;
  srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
  srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
  srv_desc.Buffer.FirstElement = 0;
  srv_desc.Buffer.NumElements = copy_buffer_size >> 2;
  srv_desc.Buffer.StructureByteStride = 0;
  srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
  device->CreateShaderResourceView(copy_buffer, &srv_desc,
                                   descriptor_cpu_start);
  // Descriptor 1 is the raw UAV of the entire EDRAM buffer (sized in dwords).
  D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
  uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
  uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
  uav_desc.Buffer.FirstElement = 0;
  uav_desc.Buffer.NumElements = 2 * 2048 * 1280;
  uav_desc.Buffer.StructureByteStride = 0;
  uav_desc.Buffer.CounterOffsetInBytes = 0;
  uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
  D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle;
  uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view;
  device->CreateUnorderedAccessView(edram_buffer_, nullptr, &uav_desc,
                                    uav_cpu_handle);
  command_list->SetComputeRootSignature(edram_load_store_root_signature_);
  command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
  // Sort the bindings in ascending order of EDRAM base so data in the render
  // targets placed farther in EDRAM isn't lost in case of overlap. The binding
  // index is only a tie-breaker for equal bases - std::sort requires a strict
  // weak ordering, so it must not be consulted when the bases differ.
  std::sort(
      store_bindings, store_bindings + store_binding_count,
      [this](uint32_t a, uint32_t b) {
        const RenderTargetBinding& binding_a = current_bindings_[a];
        const RenderTargetBinding& binding_b = current_bindings_[b];
        if (binding_a.edram_base != binding_b.edram_base) {
          return binding_a.edram_base < binding_b.edram_base;
        }
        return a < b;
      });
  // Store each render target.
  for (uint32_t i = 0; i < store_binding_count; ++i) {
    const RenderTargetBinding& binding = current_bindings_[store_bindings[i]];
    const RenderTarget* render_target = binding.render_target;
    EDRAMLoadStorePipelineIndex pipeline_index;
    bool is_64bpp = false;
    if (render_target->key.is_depth) {
      if (DepthRenderTargetFormat(render_target->key.format) ==
          DepthRenderTargetFormat::kD24FS8) {
        pipeline_index = EDRAMLoadStorePipelineIndex::kDepthFloatStore;
      } else {
        pipeline_index = EDRAMLoadStorePipelineIndex::kDepthUnormStore;
      }
    } else {
      switch (ColorRenderTargetFormat(render_target->key.format)) {
        case ColorRenderTargetFormat::k_8_8_8_8:
        case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
        case ColorRenderTargetFormat::k_2_10_10_10:
        case ColorRenderTargetFormat::k_16_16:
        case ColorRenderTargetFormat::k_16_16_FLOAT:
        case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16:
        case ColorRenderTargetFormat::k_32_FLOAT:
          pipeline_index = EDRAMLoadStorePipelineIndex::kColor32bppStore;
          break;
        case ColorRenderTargetFormat::k_16_16_16_16:
        case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
        case ColorRenderTargetFormat::k_32_32_FLOAT:
          pipeline_index = EDRAMLoadStorePipelineIndex::kColor64bppStore;
          is_64bpp = true;
          break;
        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
          pipeline_index = EDRAMLoadStorePipelineIndex::kColor7e3Store;
          break;
        default:
          assert_unhandled_case(render_target->key.format);
          continue;
      }
    }
    // Copy the color or depth plane (subresource 0) to the scratch buffer.
    D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
    location_source.pResource = render_target->resource;
    location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
    location_source.SubresourceIndex = 0;
    location_dest.pResource = copy_buffer;
    location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
    location_dest.PlacedFootprint = render_target->footprints[0];
    // TODO(Triang3l): Box for color render targets.
    command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
                                    nullptr);
    // Zero-initialized so the stencil fields, which are only filled for depth
    // render targets, never carry indeterminate stack contents.
    EDRAMLoadStoreRootConstants root_constants = {};
    root_constants.base_tiles = binding.edram_base;
    root_constants.pitch_tiles = surface_pitch_tiles * (is_64bpp ? 2 : 1);
    root_constants.rt_color_depth_pitch =
        location_dest.PlacedFootprint.Footprint.RowPitch;
    if (render_target->key.is_depth) {
      // Copy the stencil plane (subresource 1) after the depth plane.
      location_source.SubresourceIndex = 1;
      location_dest.PlacedFootprint = render_target->footprints[1];
      command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
                                      nullptr);
      root_constants.rt_stencil_offset =
          uint32_t(location_dest.PlacedFootprint.Offset);
      root_constants.rt_stencil_pitch =
          location_dest.PlacedFootprint.Footprint.RowPitch;
    }
    // Transition the copy buffer to SRV.
    barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
    barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
    barriers[0].Transition.pResource = copy_buffer;
    barriers[0].Transition.Subresource =
        D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
    barriers[0].Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;
    barriers[0].Transition.StateAfter =
        D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
    copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
    command_list->ResourceBarrier(1, barriers);
    // Store the data. One thread group is dispatched per tile: X is the pitch
    // in tiles, Y is the number of dirty rows of tiles.
    command_list->SetComputeRoot32BitConstants(
        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
    command_processor_->SetPipeline(
        edram_load_store_pipelines_[size_t(pipeline_index)]);
    command_list->Dispatch(
        root_constants.pitch_tiles,
        binding.edram_dirty_length / root_constants.pitch_tiles, 1);
    // Commit the UAV write and prepare for copying again.
    barrier_count = 1;
    barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
    barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
    barriers[0].UAV.pResource = edram_buffer_;
    if (i + 1 < store_binding_count) {
      barrier_count = 2;
      barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
      barriers[1].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
      barriers[1].Transition.pResource = copy_buffer;
      barriers[1].Transition.Subresource =
          D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
      barriers[1].Transition.StateBefore =
          D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
      barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST;
      copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
    }
    command_list->ResourceBarrier(barrier_count, barriers);
  }
  // Hand the scratch buffer back along with the state it was left in.
  command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state);
 }
 }  // namespace d3d12
 }  // namespace gpu
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@ -201,6 +201,7 @@ class RenderTargetCache {
                    RegisterFile* register_file);
  ~RenderTargetCache();
  bool Initialize();
  void Shutdown();
  void ClearCache();
@ -233,6 +234,27 @@ class RenderTargetCache {
  }
 private:
  // Indices of the EDRAM load/store compute pipelines, used to index both
  // edram_load_store_pipeline_info_ and edram_load_store_pipelines_. The order
  // must match the order of the entries in edram_load_store_pipeline_info_.
  enum class EDRAMLoadStorePipelineIndex {
    kColor32bppLoad,
    kColor32bppStore,
    kColor64bppLoad,
    kColor64bppStore,
    kColor7e3Load,
    kColor7e3Store,
    kDepthUnormLoad,
    kDepthUnormStore,
    kDepthFloatLoad,
    kDepthFloatStore,
    // Number of pipelines, not a pipeline itself.
    kCount
  };
  // Static description of one EDRAM load/store compute pipeline.
  struct EDRAMLoadStorePipelineInfo {
    // Compiled compute shader bytecode.
    const void* shader;
    // Size of the bytecode in bytes.
    size_t shader_size;
    // Human-readable name of the pipeline (wide string).
    const WCHAR* name;
  };
  union RenderTargetKey {
    struct {
      // Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16.
@ -267,8 +289,12 @@ class RenderTargetCache {
    RenderTargetKey key;
    // The first 4 MB page in the heaps.
    uint32_t heap_page_first;
-    // Number of 4 MB pages this render target uses.
+    // The number of 4 MB pages this render target uses.
    uint32_t heap_page_count;
    // Color/depth and stencil layouts.
    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[2];
    // Buffer size needed to copy the render target to the EDRAM buffer.
    uint32_t copy_buffer_size;
  };
  struct RenderTargetBinding {
@ -294,13 +320,34 @@ class RenderTargetCache {
  RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
                                         uint32_t heap_page_first);
-  // Must be in a frame to call. Writes the dirty areas of the currently bound
+  // Must be in a frame to call. Stores the dirty areas of the currently bound
  // render targets and marks them as clean.
-  void WriteRenderTargetsToEDRAM();
+  void StoreRenderTargetsToEDRAM();
  D3D12CommandProcessor* command_processor_;
  RegisterFile* register_file_;
  // The EDRAM buffer allowing color and depth data to be reinterpreted.
  ID3D12Resource* edram_buffer_ = nullptr;
  D3D12_RESOURCE_STATES edram_buffer_state_;
  bool edram_buffer_cleared_;
  // EDRAM buffer load/store root signature.
  ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
  // Root constants passed to the EDRAM load/store compute shaders. The layout
  // must match the XeEDRAMLoadStoreConstants cbuffer in edram_load_store.hlsli.
  struct EDRAMLoadStoreRootConstants {
    // Index of the first EDRAM tile of the render target.
    uint32_t base_tiles;
    // Number of EDRAM tiles in one row of tiles of the surface.
    uint32_t pitch_tiles;
    // Row pitch, in bytes, of the color or depth data in the copy buffer.
    uint32_t rt_color_depth_pitch;
    // Byte offset of the stencil data in the copy buffer (depth only).
    uint32_t rt_stencil_offset;
    // Row pitch, in bytes, of the stencil data in the copy buffer (depth only).
    uint32_t rt_stencil_pitch;
  };
  // EDRAM buffer load/store pipelines.
  static const EDRAMLoadStorePipelineInfo
      edram_load_store_pipeline_info_[size_t(
          EDRAMLoadStorePipelineIndex::kCount)];
  ID3D12PipelineState* edram_load_store_pipelines_[size_t(
      EDRAMLoadStorePipelineIndex::kCount)] = {};
  // 32 MB heaps backing the used render target resources, created when needed.
  // 24 MB proved to be not enough to store a single render target occupying the
  // entire EDRAM - a 32-bit depth/stencil one - at some resolution.
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
@ -0,0 +1,14 @@
 #include "edram_load_store.hlsli"
 // Loads 32bpp color data from the EDRAM buffer into the render target copy
 // buffer. One thread group handles one tile (selected by the group ID), each
 // of its 20x16 threads copying 4 consecutive dwords, so a group covers 80x16
 // dwords.
 [numthreads(20, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Position of the first of the 4 dwords within the tile.
  uint2 tile_dword_index = xe_group_thread_id.xy;
  tile_dword_index.x *= 4u;
  uint4 pixels = xe_edram_load_store_source.Load4(
      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
  // Byte offset in the row-pitched copy buffer (16 bytes per thread along X).
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u;
  xe_edram_load_store_dest.Store4(rt_offset, pixels);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
@ -0,0 +1,19 @@
 #include "edram_load_store.hlsli"
 // Loads 64bpp color data from the EDRAM buffer into the render target copy
 // buffer. One thread group handles one tile, each of its 40x8 threads copying
 // 4 consecutive dwords (2 texels).
 [numthreads(40, 8, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
  // from 1 render target row rather than 2. Threads with X 0-19 are for the
  // first row, with 20-39 are for the second.
  uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
  [flatten] if (xe_group_thread_id.x >= 20u) {
    // uint(-80) wraps around, so this moves X back by 80 dwords to the start
    // of the row while Y advances to the second row of the pair.
    tile_dword_index += uint2(uint(-80), 1u);
  }
  uint4 pixels = xe_edram_load_store_source.Load4(
      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
  // Byte offset in the row-pitched copy buffer (16 bytes per thread along X).
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u;
  xe_edram_load_store_dest.Store4(rt_offset, pixels);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
@ -0,0 +1,20 @@
 #include "edram_load_store.hlsli"
 #include "pixel_formats.hlsli"
 // Loads 7e3 color data from the EDRAM buffer into the render target copy
 // buffer, expanding each 32-bit pixel to four 16-bit components. One thread
 // group handles one tile, each of its 40x16 threads converting 2 pixels
 // (2 dwords in EDRAM, 4 dwords in the copy buffer).
 [numthreads(40, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Position of the first of the 2 dwords within the tile.
  uint2 tile_dword_index = xe_group_thread_id.xy;
  tile_dword_index.x *= 2u;
  uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2(
      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
  // XeFloat7e3To16 comes from pixel_formats.hlsli; it returns one component
  // per 32-bit lane.
  uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x);
  uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y);
  // Pack pairs of components into dwords: X and Z in the low 16 bits, Y and W
  // in the high 16 bits.
  uint4 pixels_f16u32_packed =
      uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
      (uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
  // Byte offset in the row-pitched copy buffer (16 bytes per thread along X).
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u;
  xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
@ -0,0 +1,31 @@
 #include "edram_load_store.hlsli"
 #include "pixel_formats.hlsli"
 // Loads floating-point depth and stencil from the EDRAM buffer into the render
 // target copy buffer. One thread group handles one tile, each of its 20x16
 // threads processing 4 pixels: 4 depth dwords and 4 stencil bytes packed into
 // 1 dword.
 [numthreads(20, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Position of the first of the 4 dwords within the tile.
  uint2 tile_dword_index = xe_group_thread_id.xy;
  tile_dword_index.x *= 4u;
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
  uint4 depth24 = depth24_stencil & 0xFFFFFFu;
  // The 32-bit depth copy is at the same offset 10 MB (10485760 bytes) further
  // in the buffer.
  uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
  // Depth. If the stored 32-bit depth converted to 24-bit is the same as the
  // stored 24-bit depth, load the 32-bit value because it has more precision
  // (and multipass rendering is possible), if it's not, convert the 24-bit
  // depth because it was overwritten by aliasing.
  uint4 depth24to32 = XeFloat20e4To32(depth24);
  // Branchless per-component select: the comparison result (0 or 1) chooses
  // between depth24to32 and depth32.
  uint4 depth = depth24to32 + (depth32 - depth24to32) *
                uint4(XeFloat32To20e4(depth32) == depth24);
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u;
  xe_edram_load_store_dest.Store4(rt_offset, depth);
  // Stencil. The upper 8 bits of each of the 4 pixels are packed into the 4
  // bytes of one dword.
  uint4 stencil = (depth24_stencil >> 24u) << uint4(0u, 8u, 16u, 24u);
  stencil.xy |= stencil.zw;
  stencil.x |= stencil.y;
  rt_offset = xe_edram_rt_stencil_offset +
              xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
  xe_edram_load_store_dest.Store(rt_offset, stencil.x);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
@ -0,0 +1,22 @@
 #include "edram_load_store.hlsli"
 // Loads 24-bit depth and stencil from the EDRAM buffer into the render target
 // copy buffer. One thread group handles one tile, each of its 20x16 threads
 // processing 4 pixels: 4 depth dwords and 4 stencil bytes packed into 1 dword.
 [numthreads(20, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Position of the first of the 4 dwords within the tile.
  uint2 tile_dword_index = xe_group_thread_id.xy;
  tile_dword_index.x *= 4u;
  uint4 pixels = xe_edram_load_store_source.Load4(
      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
  // Depth. Only the lower 24 bits are written.
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u;
  xe_edram_load_store_dest.Store4(rt_offset, pixels & 0xFFFFFFu);
  // Stencil. The upper 8 bits of each of the 4 pixels are packed into the 4
  // bytes of one dword.
  uint4 stencil = (pixels >> 24u) << uint4(0u, 8u, 16u, 24u);
  stencil.xy |= stencil.zw;
  stencil.x |= stencil.y;
  rt_offset = xe_edram_rt_stencil_offset +
              xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
  xe_edram_load_store_dest.Store(rt_offset, stencil.x);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
@ -0,0 +1,21 @@
 #ifndef XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
 #define XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
 // Constants for the EDRAM load/store shaders, supplied by the host as 32-bit
 // root constants at register b0. The layout must match
 // EDRAMLoadStoreRootConstants in render_target_cache.h.
 cbuffer XeEDRAMLoadStoreConstants : register(b0) {
  // Index of the first EDRAM tile of the render target.
  uint xe_edram_base_tiles;
  // Number of EDRAM tiles in one row of tiles.
  uint xe_edram_pitch_tiles;
  // Row pitch, in bytes, of the color or depth data in the render target
  // buffer.
  uint xe_edram_rt_color_depth_pitch;
  // Byte offset of the stencil data in the render target buffer.
  uint xe_edram_rt_stencil_offset;
  // Row pitch, in bytes, of the stencil data in the render target buffer.
  uint xe_edram_rt_stencil_pitch;
 };
 ByteAddressBuffer xe_edram_load_store_source : register(t0);
 RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
 // Returns the byte offset in the EDRAM buffer of a dword within a tile.
 // tile_index is the tile position relative to the render target base (X along
 // the pitch, Y in rows of tiles); tile_dword_index is the dword position
 // within the tile. A tile is 5120 bytes, a row of dwords within it 320 bytes.
 uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
  return (xe_edram_base_tiles + (tile_index.y * xe_edram_pitch_tiles) +
          tile_index.x) * 5120u + tile_dword_index.y * 320u +
         tile_dword_index.x * 4u;
 }
 #endif  // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
@ -0,0 +1,14 @@
 #include "edram_load_store.hlsli"
 // Stores 32bpp color data from the render target copy buffer to the EDRAM
 // buffer. One thread group handles one tile (selected by the group ID), each
 // of its 20x16 threads copying 4 consecutive dwords.
 [numthreads(20, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Byte offset in the row-pitched copy buffer (16 bytes per thread along X).
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u;
  uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
  // Position of the first of the 4 dwords within the tile.
  uint2 tile_dword_index = xe_group_thread_id.xy;
  tile_dword_index.x *= 4u;
  xe_edram_load_store_dest.Store4(
      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
@ -0,0 +1,19 @@
 #include "edram_load_store.hlsli"
 // Stores 64bpp color data from the render target copy buffer to the EDRAM
 // buffer. One thread group handles one tile, each of its 40x8 threads copying
 // 4 consecutive dwords (2 texels).
 [numthreads(40, 8, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Byte offset in the row-pitched copy buffer (16 bytes per thread along X).
  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
                   xe_thread_id.x * 16u;
  uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
  // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
  // from 1 render target row rather than 2. Threads with X 0-19 are for the
  // first row, with 20-39 are for the second.
  uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
  [flatten] if (xe_group_thread_id.x >= 20u) {
    // uint(-80) wraps around, so this moves X back by 80 dwords to the start
    // of the row while Y advances to the second row of the pair.
    tile_dword_index += uint2(uint(-80), 1u);
  }
  xe_edram_load_store_dest.Store4(
      XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
@ -0,0 +1,19 @@
 #include "edram_load_store.hlsli"
 #include "pixel_formats.hlsli"
 // Converts 16-bit-per-component float color from the linear source buffer to
 // packed 7e3 (one dword per pixel) and stores it into EDRAM tiles. Every
 // thread reads 4 dwords (2 source pixels) and writes 2 dwords, so one 40x16
 // thread group fills the 80x16 dwords of the tile selected by the group ID.
 [numthreads(40, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  uint source_address =
      xe_thread_id.x * 16u + xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint4 source_f16 = xe_edram_load_store_source.Load4(source_address);
  // Each source dword holds two halves: .xy is the first pixel, .zw is the
  // second. The conversion only needs the half in the low 16 bits of each
  // component, so the upper halves are simply shifted down.
  uint2 packed_7e3;
  packed_7e3.x = XeFloat16To7e3(uint4(source_f16.x, source_f16.x >> 16u,
                                      source_f16.y, source_f16.y >> 16u));
  packed_7e3.y = XeFloat16To7e3(uint4(source_f16.z, source_f16.z >> 16u,
                                      source_f16.w, source_f16.w >> 16u));
  uint2 dword_in_tile =
      uint2(xe_group_thread_id.x * 2u, xe_group_thread_id.y);
  xe_edram_load_store_dest.Store2(
      XeEDRAMOffset(xe_group_id.xy, dword_in_tile), packed_7e3);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
@ -0,0 +1,25 @@
 #include "edram_load_store.hlsli"
 #include "pixel_formats.hlsli"
 // Stores float depth and stencil from the linear source buffer into EDRAM
 // tiles. Every thread handles 4 pixels; one 20x16 thread group fills the
 // 80x16 dwords of the tile selected by the group ID. Both a packed 24-bit
 // depth + stencil copy and the original 32-bit depth are written.
 [numthreads(20, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Depth: 4 32-bit values per thread.
  uint depth_address =
      xe_thread_id.x * 16u + xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint4 depth32 = xe_edram_load_store_source.Load4(depth_address);
  // Stencil: 4 bytes per thread packed in one dword, moved to bits 24-31 of
  // the respective pixels.
  uint stencil_address = xe_edram_rt_stencil_offset + xe_thread_id.x * 4u +
                         xe_thread_id.y * xe_edram_rt_stencil_pitch;
  uint stencil_packed = xe_edram_load_store_source.Load(stencil_address);
  uint4 stencil = uint4(stencil_packed, stencil_packed >> 8u,
                        stencil_packed >> 16u, stencil_packed >> 24u) << 24u;
  uint4 depth24_stencil = XeFloat32To20e4(depth32) | stencil;
  uint2 dword_in_tile =
      uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, dword_in_tile);
  // The 24-bit depth is kept for aliasing and for checking whether the 32-bit
  // depth is still up to date.
  xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
  // The 32-bit depth is kept so precision isn't lost in multipass rendering.
  // It goes 10485760 bytes (2048 tiles of 5120 bytes) past the 24-bit copy -
  // presumably right after the EDRAM contents proper; confirm with the host
  // buffer layout.
  xe_edram_load_store_dest.Store4(edram_offset + 10485760u, depth32);
 }
--- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
@ -0,0 +1,20 @@
 #include "edram_load_store.hlsli"
 // Stores 24-bit integer depth and stencil from the linear source buffer into
 // EDRAM tiles. Every thread handles 4 pixels; one 20x16 thread group fills
 // the 80x16 dwords of the tile selected by the group ID.
 [numthreads(20, 16, 1)]
 void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Depth: only the low 24 bits of each source dword are kept.
  uint depth_address =
      xe_thread_id.x * 16u + xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint4 depth24 =
      xe_edram_load_store_source.Load4(depth_address) & 0xFFFFFFu;
  // Stencil: 4 bytes per thread packed in one dword, moved to bits 24-31 of
  // the respective pixels.
  uint stencil_address = xe_edram_rt_stencil_offset + xe_thread_id.x * 4u +
                         xe_thread_id.y * xe_edram_rt_stencil_pitch;
  uint stencil_packed = xe_edram_load_store_source.Load(stencil_address);
  uint4 stencil = uint4(stencil_packed, stencil_packed >> 8u,
                        stencil_packed >> 16u, stencil_packed >> 24u) << 24u;
  uint2 dword_in_tile =
      uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
  xe_edram_load_store_dest.Store4(
      XeEDRAMOffset(xe_group_id.xy, dword_in_tile), depth24 | stencil);
 }
--- a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
@ -0,0 +1,74 @@
 #ifndef XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
 #define XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
 // Packs a color given as 4 16-bit floats (one per component, in the low 16
 // bits of each element of rgba_f16u32) into one dword: red, green and blue as
 // 10-bit 7e3 floats (3-bit exponent, 7-bit mantissa) in bits 0-9, 10-19 and
 // 20-29, and alpha as a 2-bit unsigned normalized value in bits 30-31.
 uint XeFloat16To7e3(uint4 rgba_f16u32) {
  float4 rgba_f32 = f16tof32(rgba_f16u32);
  uint3 rgb_f32u32 = asuint(rgba_f32.xyz);
  // Keep only positive (high bit set means negative for both float and int) and
  // saturate to 31.875 (also dropping NaNs).
  rgb_f32u32 = uint3(clamp(int3(rgb_f32u32), 0, 0x41FF0000));
  // Normalized result: rebias the exponent from 127 to 3 (subtract 124 << 23).
  uint3 normalized = rgb_f32u32 + 0xC2000000u;
  // Denormalized result: shift the mantissa with its implicit leading 1. The
  // shift is clamped to 24 (enough to clear all 24 mantissa bits) so the
  // result doesn't depend on how shift counts of 32 and above wrap.
  uint3 denormalized = ((rgb_f32u32 & 0x7FFFFFu) | 0x800000u) >>
                       min((125u).xxx - (rgb_f32u32 >> 23u), 24u);
  // Values below 2^-2 (float exponent < 125) are denormals in 7e3.
  uint3 rgb_f10u32 = normalized + (denormalized - normalized) *
                     uint3(rgb_f32u32 < 0x3E800000u);
  // Drop the 16 extra mantissa bits, rounding to the nearest even.
  rgb_f10u32 =
      ((rgb_f10u32 + 0x7FFFu + ((rgb_f10u32 >> 16u) & 1u)) >> 16u) & 0x3FFu;
  // Alpha must be rounded rather than truncated: 1/3 and 2/3 are not exactly
  // representable as floats (XeFloat7e3To16 produces values slightly below
  // them), so truncation would turn 1 into 0 and 2 into 1 on load->store.
  return rgb_f10u32.r | (rgb_f10u32.g << 10u) | (rgb_f10u32.b << 20u) |
         (uint(round(saturate(rgba_f32.a) * 3.0)) << 30u);
 }
 // Unpacks a dword holding red, green and blue as 10-bit 7e3 floats (bits 0-9,
 // 10-19, 20-29) and a 2-bit alpha (bits 30-31) into 4 16-bit floats, each
 // returned in the low 16 bits of the respective component.
 uint4 XeFloat7e3To16(uint rgba_packed) {
  uint3 packed_rgb =
      uint3(rgba_packed, rgba_packed >> 10u, rgba_packed >> 20u) & 0x3FFu;
  uint3 biased_exponent = packed_rgb >> 7u;
  uint3 fraction = packed_rgb & 0x7Fu;
  // Components with a zero exponent are denormalized and need their leading 1
  // shifted out of the 7-bit mantissa, decrementing the exponent per shift:
  // Exponent = 1;
  // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x80) == 0);
  uint3 denormal_mask = uint3(biased_exponent == 0u);
  uint3 normalize_shift = (7u).xxx - firstbithigh(fraction);
  uint3 denormal_exponent = (1u).xxx - normalize_shift;
  uint3 denormal_fraction = (fraction << normalize_shift) & 0x7Fu;
  // Branchless select: the mask is 1 for denormalized components, 0 otherwise.
  biased_exponent += (denormal_exponent - biased_exponent) * denormal_mask;
  fraction += (denormal_fraction - fraction) * denormal_mask;
  // Rebias the exponent from 3 to 127, widen the mantissa from 7 to 23 bits,
  // and force components that were entirely zero back to zero.
  uint3 rgb_f32u32 = (((biased_exponent + 124u) << 23u) | (fraction << 16u)) *
                     uint3(packed_rgb != 0u);
  float alpha = float(rgba_packed >> 30u) * (1.0 / 3.0);
  return f32tof16(float4(asfloat(rgb_f32u32), alpha));
 }
 // Based on CFloat24 from d3dref9.dll and the 6e4 code from:
 // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
 // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
 // We also can't clamp the stored value to 1 as load->store->load must be exact.
 // Converts 4 32-bit float bit patterns to the 24-bit 20e4 format (4-bit
 // exponent in bits 20-23, 20-bit mantissa in bits 0-19). Bits 24-31 of the
 // result are zero.
 uint4 XeFloat32To20e4(uint4 f32u32) {
  // Keep only positive (high bit set means negative for both float and int) and
  // saturate to the maximum representable value near 2 (also dropping NaNs).
  f32u32 = uint4(clamp(int4(f32u32), 0, 0x3FFFFFF8));
  // Normalized result: rebias the exponent from 127 to 15 (subtract 112 << 23).
  uint4 normalized = f32u32 + 0xC8000000u;
  // Denormalized result: shift the mantissa with its implicit leading 1. The
  // shift must be clamped: shader shifts only use the low 5 bits of the count,
  // so for tiny inputs an unclamped 113 - exponent wraps around (for +0.0 it
  // becomes 113 & 31 = 17) and produces a non-zero result. 24 is enough to
  // clear all 24 mantissa bits.
  uint4 denormalized = ((f32u32 & 0x7FFFFFu) | 0x800000u) >>
                       min((113u).xxxx - (f32u32 >> 23u), 24u);
  // Values below 2^-14 (float exponent < 113) are denormals in 20e4.
  uint4 f24u32 =
      normalized + (denormalized - normalized) * uint4(f32u32 < 0x38800000u);
  // Drop the 3 extra mantissa bits, rounding to the nearest even.
  return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
 }
 // Expands 4 values in the 24-bit 20e4 floating-point format (4-bit exponent in
 // bits 20-23, 20-bit mantissa in bits 0-19) to 32-bit float bit patterns.
 // Everything from bit 20 up is treated as the exponent, so bits 24-31 of the
 // inputs must be zero.
 uint4 XeFloat20e4To32(uint4 f24u32) {
  // The mantissa is the low 20 bits - masking with 0xF00000 would pick the
  // exponent bits instead and lose the fractional part entirely.
  uint4 mantissa = f24u32 & 0xFFFFFu;
  uint4 exponent = f24u32 >> 20u;
  // Normalize the values for the denormalized components.
  // Exponent = 1;
  // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0);
  uint4 is_denormalized = uint4(exponent == 0u);
  uint4 mantissa_lzcnt = (20u).xxxx - firstbithigh(mantissa);
  // Branchless select: is_denormalized is 1 where the replacement applies.
  exponent += ((1u).xxxx - mantissa_lzcnt - exponent) * is_denormalized;
  mantissa +=
      (((mantissa << mantissa_lzcnt) & 0xFFFFFu) - mantissa) * is_denormalized;
  // Combine into 32-bit float bits (rebias the exponent from 15 to 127, widen
  // the mantissa from 20 to 23 bits) and clear zeros.
  return (((exponent + 112u) << 23u) | (mantissa << 3u)) * uint4(f24u32 != 0u);
 }
 #endif  // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
--- a/src/xenia/gpu/d3d12/shared_memory.cc
+++ b/src/xenia/gpu/d3d12/shared_memory.cc
@ -394,7 +394,7 @@ void SharedMemory::TransitionBuffer(D3D12_RESOURCE_STATES new_state,
  barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
  barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
  barrier.Transition.pResource = buffer_;
-  barrier.Transition.Subresource = 0;
+  barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
  barrier.Transition.StateBefore = buffer_state_;
  barrier.Transition.StateAfter = new_state;
  command_list->ResourceBarrier(1, &barrier);
--- a/src/xenia/gpu/d3d12/texture_cache.cc
+++ b/src/xenia/gpu/d3d12/texture_cache.cc
@ -741,7 +741,8 @@ bool TextureCache::LoadTextureData(Texture* texture) {
    if (copy_buffer_state != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
      barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
      barriers[0].Transition.pResource = copy_buffer;
-      barriers[0].Transition.Subresource = 0;
+      barriers[0].Transition.Subresource =
          D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
      barriers[0].Transition.StateBefore = copy_buffer_state;
      barriers[0].Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
      command_list->ResourceBarrier(1, barriers);
@ -792,7 +793,8 @@ bool TextureCache::LoadTextureData(Texture* texture) {
    barriers[0].UAV.pResource = copy_buffer;
    barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
    barriers[1].Transition.pResource = copy_buffer;
-    barriers[1].Transition.Subresource = 0;
+    barriers[1].Transition.Subresource =
        D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
    barriers[1].Transition.StateBefore = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
    barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
    command_list->ResourceBarrier(2, barriers);