diff --git a/src/xenia/base/filesystem_posix.cc b/src/xenia/base/filesystem_posix.cc index 2e9ddb2c5..193e637ea 100644 --- a/src/xenia/base/filesystem_posix.cc +++ b/src/xenia/base/filesystem_posix.cc @@ -217,6 +217,10 @@ std::vector ListFiles(const std::filesystem::path& path) { } while (auto ent = readdir(dir)) { + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + continue; + } + FileInfo info; info.name = ent->d_name; @@ -225,6 +229,7 @@ std::vector ListFiles(const std::filesystem::path& path) { info.create_timestamp = convertUnixtimeToWinFiletime(st.st_ctime); info.access_timestamp = convertUnixtimeToWinFiletime(st.st_atime); info.write_timestamp = convertUnixtimeToWinFiletime(st.st_mtime); + info.path = path; if (ent->d_type == DT_DIR) { info.type = FileInfo::Type::kDirectory; info.total_size = 0; @@ -234,7 +239,7 @@ std::vector ListFiles(const std::filesystem::path& path) { } result.push_back(info); } - + closedir(dir); return result; } diff --git a/src/xenia/base/utf8.cc b/src/xenia/base/utf8.cc index 65f798f54..a96d6b194 100644 --- a/src/xenia/base/utf8.cc +++ b/src/xenia/base/utf8.cc @@ -10,6 +10,7 @@ #include "xenia/base/utf8.h" #include +#include #include #include #include diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index b9cb88869..791b9a87d 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -481,6 +481,43 @@ struct VECTOR_COMPARE_UGT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW | + kX64EmitAVX512DQ) && + (i.instr->flags != FLOAT32_TYPE)) { + Xmm src1 = e.xmm0; + if (i.src1.is_constant) { + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + Xmm src2 = e.xmm1; + if (i.src2.is_constant) { + e.LoadConstantXmm(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + + switch (i.instr->flags) { + 
case INT8_TYPE: + e.vpcmpub(e.k1, src1, src2, 0x6); + e.vpmovm2b(i.dest, e.k1); + break; + case INT16_TYPE: + e.vpcmpuw(e.k1, src1, src2, 0x6); + e.vpmovm2w(i.dest, e.k1); + break; + case INT32_TYPE: + e.vpcmpud(e.k1, src1, src2, 0x6); + e.vpmovm2d(i.dest, e.k1); + break; + default: + assert_always(); + break; + } + return; + } + Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy switch (i.instr->flags) { case INT8_TYPE: diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 4c1640302..bc4e91287 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -646,8 +646,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { break; case OPCODE_AND_NOT: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->AndNot(i->src2.value); + v->set_from(i->src2.value); + v->Not(); + v->And(i->src1.value); i->UnlinkAndNOP(); result = true; } diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 0ea2fb4ad..15db03282 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -324,8 +324,13 @@ int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + Value* sum = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE, + ARITHMETIC_UNSIGNED); + Value* overflow = f.VectorCompareUGT(f.LoadVR(i.VX.VA), sum, INT32_TYPE); + Value* carry = + f.VectorShr(overflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE); + f.StoreVR(i.VX.VD, carry); + return 0; } int InstrEmit_vaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) { @@ -1665,7 +1670,11 @@ int InstrEmit_vsrw128(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vsubcuw(PPCHIRBuilder& f, const InstrData& 
i) { - XEINSTRNOTIMPLEMENTED(); + Value* underflow = + f.VectorCompareUGE(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE); + Value* borrow = + f.VectorShr(underflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE); + f.StoreVR(i.VX.VD, borrow); return 1; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 28c1b214d..3da3bfbde 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2574,7 +2574,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + + const bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; // Pixel shader analysis. bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); @@ -2604,7 +2605,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } const bool memexport_used_pixel = - pixel_shader && pixel_shader->is_valid_memexport_used(); + pixel_shader && (pixel_shader->memexport_eM_written() != 0); const bool memexport_used = memexport_used_vertex || memexport_used_pixel; if (!BeginSubmission(true)) { @@ -2831,12 +2832,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Gather memexport ranges and ensure the heaps for them are resident, and // also load the data surrounding the export and to fill the regions that // won't be modified by the shaders. 
- - memexport_range_count_ = 0; - if (memexport_used_vertex || memexport_used_pixel) { - bool retflag; - bool retval = GatherMemexportRangesAndMakeResident(retflag); - if (retflag) return retval; + memexport_ranges_.clear(); + if (memexport_used_vertex) { + draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_); + } + if (memexport_used_pixel) { + draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_); + } + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, + memexport_range.size_bytes)) { + XELOGE( + "Failed to request memexport stream at 0x{:08X} (size {}) in the " + "shared memory", + memexport_range.base_address_dwords << 2, memexport_range.size_bytes); + return false; + } } // Primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology; @@ -2935,11 +2946,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // If the shared memory is a UAV, it can't be used as an index buffer // (UAV is a read/write state, index buffer is a read-only state). // Need to copy the indices to a buffer in the index buffer state. 
- bool retflag; - bool retval = HandleMemexportGuestDMA( - scratch_index_buffer, index_buffer_view, - primitive_processing_result.guest_index_base, retflag); - if (retflag) return retval; + scratch_index_buffer = RequestScratchGPUBuffer( + index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST); + if (scratch_index_buffer == nullptr) { + return false; + } + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + deferred_command_list_.D3DCopyBufferRegion( + scratch_index_buffer, 0, shared_memory_->GetBuffer(), + primitive_processing_result.guest_index_base, + index_buffer_view.SizeInBytes); + PushTransitionBarrier(scratch_index_buffer, + D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_INDEX_BUFFER); + index_buffer_view.BufferLocation = + scratch_index_buffer->GetGPUVirtualAddress(); } else { index_buffer_view.BufferLocation = shared_memory_->GetGPUAddress() + @@ -2977,199 +2999,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } if (memexport_used) { - HandleMemexportDrawOrdering_AndReadback(); - } - - return true; -} -XE_COLD -XE_NOINLINE -bool D3D12CommandProcessor::HandleMemexportGuestDMA( - ID3D12Resource*& scratch_index_buffer, - D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base, - // xe::gpu::PrimitiveProcessor::ProcessingResult& - // primitive_processing_result, - bool& retflag) { - retflag = true; - scratch_index_buffer = RequestScratchGPUBuffer( - index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST); - if (scratch_index_buffer == nullptr) { - return false; - } - shared_memory_->UseAsCopySource(); - SubmitBarriers(); - deferred_command_list_.D3DCopyBufferRegion( - scratch_index_buffer, 0, shared_memory_->GetBuffer(), guest_index_base, - index_buffer_view.SizeInBytes); - PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST, - D3D12_RESOURCE_STATE_INDEX_BUFFER); - index_buffer_view.BufferLocation = - scratch_index_buffer->GetGPUVirtualAddress(); - retflag = 
false; - return {}; -} -XE_NOINLINE -XE_COLD -bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident( - bool& retflag) { - auto vertex_shader = static_cast(active_vertex_shader()); - auto pixel_shader = static_cast(active_pixel_shader()); - const xe::gpu::RegisterFile& regs = *register_file_; - const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); - const bool memexport_used_pixel = - pixel_shader && pixel_shader->is_valid_memexport_used(); - retflag = true; - if (memexport_used_vertex) { - for (uint32_t constant_index : - vertex_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; - } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::GetName( - xenos::TextureFormat(uint32_t(memexport_stream.format)))); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - // Try to reduce the number of shared memory operations when writing - // different elements into the same buffer through different exports - // (happens in 4D5307E6). - bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - MemExportRange& memexport_range = memexport_ranges_[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - // Add a new range if haven't expanded an existing one. 
- if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges_[memexport_range_count_++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = memexport_size_dwords; - } + // Make sure this memexporting draw is ordered with other work using shared + // memory as a UAV. + // TODO(Triang3l): Find some PM4 command that can be used for indication of + // when memexports should be awaited? + shared_memory_->MarkUAVWritesCommitNeeded(); + // Invalidate textures in memexported memory and watch for changes. + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + shared_memory_->RangeWrittenByGpu( + memexport_range.base_address_dwords << 2, memexport_range.size_bytes, + false); } - } - if (memexport_used_pixel) { - for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; + if (cvars::d3d12_readback_memexport) { + // Read the exported data on the CPU. 
+ uint32_t memexport_total_size = 0; + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + memexport_total_size += memexport_range.size_bytes; } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::GetName( - xenos::TextureFormat(uint32_t(memexport_stream.format)))); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - MemExportRange& memexport_range = memexport_ranges_[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges_[memexport_range_count_++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = memexport_size_dwords; - } - } - } - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2)) { - XELOGE( - "Failed to request memexport stream at 0x{:08X} (size {}) in the " - "shared memory", - memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2); - return false; - } - } - retflag = false; - return {}; -} -XE_NOINLINE -XE_COLD -void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() { - // Make sure this memexporting draw is ordered with other work using shared - // memory as a UAV. - // TODO(Triang3l): Find some PM4 command that can be used for indication of - // when memexports should be awaited? 
- shared_memory_->MarkUAVWritesCommitNeeded(); - // Invalidate textures in memexported memory and watch for changes. - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2, false); - } - if (cvars::d3d12_readback_memexport) { - // Read the exported data on the CPU. - uint32_t memexport_total_size = 0; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - memexport_total_size += memexport_ranges_[i].size_dwords << 2; - } - if (memexport_total_size != 0) { - ID3D12Resource* readback_buffer = - RequestReadbackBuffer(memexport_total_size); - if (readback_buffer != nullptr) { - shared_memory_->UseAsCopySource(); - SubmitBarriers(); - ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); - uint32_t readback_buffer_offset = 0; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - uint32_t memexport_range_size = memexport_range.size_dwords << 2; - deferred_command_list_.D3DCopyBufferRegion( - readback_buffer, readback_buffer_offset, shared_memory_buffer, - memexport_range.base_address_dwords << 2, memexport_range_size); - readback_buffer_offset += memexport_range_size; - } - if (AwaitAllQueueOperationsCompletion()) { - D3D12_RANGE readback_range; - readback_range.Begin = 0; - readback_range.End = memexport_total_size; - void* readback_mapping; - if (SUCCEEDED(readback_buffer->Map(0, &readback_range, - &readback_mapping))) { - const uint32_t* readback_dwords = - reinterpret_cast(readback_mapping); - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - std::memcpy(memory_->TranslatePhysical( - memexport_range.base_address_dwords << 2), - readback_dwords, memexport_range.size_dwords << 2); - readback_dwords += memexport_range.size_dwords; + if 
(memexport_total_size != 0) { + ID3D12Resource* readback_buffer = + RequestReadbackBuffer(memexport_total_size); + if (readback_buffer != nullptr) { + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); + uint32_t readback_buffer_offset = 0; + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + uint32_t memexport_range_size = memexport_range.size_bytes; + deferred_command_list_.D3DCopyBufferRegion( + readback_buffer, readback_buffer_offset, shared_memory_buffer, + memexport_range.base_address_dwords << 2, memexport_range_size); + readback_buffer_offset += memexport_range_size; + } + if (AwaitAllQueueOperationsCompletion()) { + D3D12_RANGE readback_range; + readback_range.Begin = 0; + readback_range.End = memexport_total_size; + void* readback_mapping; + if (SUCCEEDED(readback_buffer->Map(0, &readback_range, + &readback_mapping))) { + const uint8_t* readback_bytes = + reinterpret_cast(readback_mapping); + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + std::memcpy(memory_->TranslatePhysical( + memexport_range.base_address_dwords << 2), + readback_bytes, memexport_range.size_bytes); + readback_bytes += memexport_range.size_bytes; + } + D3D12_RANGE readback_write_range = {}; + readback_buffer->Unmap(0, &readback_write_range); } - D3D12_RANGE readback_write_range = {}; - readback_buffer->Unmap(0, &readback_write_range); } } } } } + + return true; } void D3D12CommandProcessor::InitializeTrace() { @@ -5208,36 +5097,6 @@ bool D3D12CommandProcessor::UpdateBindings_BindfulPath( return {}; } -uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize( - xenos::ColorFormat format) { - switch (format) { - case xenos::ColorFormat::k_8_8_8_8: - case xenos::ColorFormat::k_2_10_10_10: - // TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the - // texture cache currently. 
- // case xenos::ColorFormat::k_8_8_8_8_A: - case xenos::ColorFormat::k_10_11_11: - case xenos::ColorFormat::k_11_11_10: - case xenos::ColorFormat::k_16_16: - case xenos::ColorFormat::k_16_16_FLOAT: - case xenos::ColorFormat::k_32_FLOAT: - case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16: - case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16: - case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16: - case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16: - return 1; - case xenos::ColorFormat::k_16_16_16_16: - case xenos::ColorFormat::k_16_16_16_16_FLOAT: - case xenos::ColorFormat::k_32_32_FLOAT: - return 2; - case xenos::ColorFormat::k_32_32_32_32_FLOAT: - return 4; - default: - break; - } - return 0; -} - ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index c4dd454b3..46af23b99 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/gpu/command_processor.h" @@ -319,18 +320,7 @@ class D3D12CommandProcessor final : public CommandProcessor { bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count, IndexBufferInfo* index_buffer_info, bool major_mode_explicit) override; - XE_COLD - XE_NOINLINE - bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer, - D3D12_INDEX_BUFFER_VIEW& index_buffer_view, - uint32_t guest_index_base, - bool& retflag); - XE_NOINLINE - XE_COLD - bool GatherMemexportRangesAndMakeResident(bool& retflag); - XE_NOINLINE - XE_COLD - void HandleMemexportDrawOrdering_AndReadback(); + bool IssueCopy() override; XE_NOINLINE bool IssueCopy_ReadbackResolvePath(); @@ -502,13 +492,6 @@ class D3D12CommandProcessor final : public CommandProcessor { const size_t sampler_count_vertex, const size_t sampler_count_pixel, 
bool& retflag); - // Returns dword count for one element for a memexport format, or 0 if it's - // not supported by the D3D12 command processor (if it's smaller that 1 dword, - // for instance). - // TODO(Triang3l): Check if any game uses memexport with formats smaller than - // 32 bits per element. - static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format); - // Returns a buffer for reading GPU data back to the CPU. Assuming // synchronizing immediately after use. Always in COPY_DEST state. ID3D12Resource* RequestReadbackBuffer(uint32_t size); @@ -811,12 +794,13 @@ class D3D12CommandProcessor final : public CommandProcessor { draw_util::GetViewportInfoArgs previous_viewport_info_args_; draw_util::ViewportInfo previous_viewport_info_; - // scratch memexport data - MemExportRange memexport_ranges_[512]; - uint32_t memexport_range_count_ = 0; + std::atomic pix_capture_requested_ = false; bool pix_capturing_; + + // Temporary storage for memexport stream constants used in the draw. + std::vector memexport_ranges_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index e6461e8bd..802997580 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2022 Ben Vanik. All rights reserved. * + * Copyright 2023 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -134,7 +134,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, // // Memory export is an obvious intentional side effect. 
if (shader.kills_pixels() || shader.writes_depth() || - shader.is_valid_memexport_used() || + shader.memexport_eM_written() || (shader.writes_color_target(0) && DoesCoverageDependOnAlpha(regs.Get()))) { return true; @@ -765,8 +765,70 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, } return normalized_color_mask; } + +void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, + std::vector& ranges_out) { + if (!shader.memexport_eM_written()) { + // The shader has eA writes, but no real exports. + return; + } + uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex + ? regs.Get().base + : regs.Get().base; + for (uint32_t constant_index : shader.memexport_stream_constants()) { + const auto& stream = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_000_X + + (float_constants_base + constant_index) * 4); + if (!stream.index_count) { + continue; + } + const FormatInfo& format_info = + *FormatInfo::Get(xenos::TextureFormat(stream.format)); + if (format_info.type != FormatType::kResolvable) { + XELOGE("Unsupported memexport format {}", + FormatInfo::GetName(format_info.format)); + // Translated shaders shouldn't be performing exports with an unknown + // format, the draw can still be performed. + continue; + } + // TODO(Triang3l): Remove the unresearched format logging when it's known + // how exactly these formats need to be handled (most importantly what + // components need to be stored and in which order). 
+ switch (stream.format) { + case xenos::ColorFormat::k_8_A: + case xenos::ColorFormat::k_8_B: + case xenos::ColorFormat::k_8_8_8_8_A: + XELOGW( + "Memexport done to an unresearched format {}, report the game to " + "Xenia developers!", + FormatInfo::GetName(format_info.format)); + break; + default: + break; + } + uint32_t stream_size_bytes = + stream.index_count * (format_info.bits_per_pixel >> 3); + // Try to reduce the number of shared memory operations when writing + // different elements into the same buffer through different exports + // (happens in 4D5307E6). + bool range_reused = false; + for (MemExportRange& range : ranges_out) { + if (range.base_address_dwords == stream.base_address) { + range.size_bytes = std::max(range.size_bytes, stream_size_bytes); + range_reused = true; + break; + } + } + // Add a new range if haven't expanded an existing one. + if (!range_reused) { + ranges_out.emplace_back(stream.base_address, stream_size_bytes); + } + } +} + XE_NOINLINE XE_NOALIAS + xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 8196830b8..08c710e6c 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/gpu/register_file.h" @@ -474,6 +475,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA( return guest_sample_index ? 3 : 0; } +struct MemExportRange { + uint32_t base_address_dwords; + uint32_t size_bytes; + + explicit MemExportRange(uint32_t base_address_dwords, uint32_t size_bytes) + : base_address_dwords(base_address_dwords), size_bytes(size_bytes) {} +}; + +// Gathers memory ranges involved in memexports in the shader with the float +// constants from the registers, adding them to ranges_out. 
+void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, + std::vector& ranges_out); + // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway). XE_NOINLINE diff --git a/src/xenia/gpu/dxbc.h b/src/xenia/gpu/dxbc.h index ea44abe46..e1587a7a5 100644 --- a/src/xenia/gpu/dxbc.h +++ b/src/xenia/gpu/dxbc.h @@ -913,6 +913,8 @@ enum class OperandModifier : uint32_t { struct Dest : OperandAddress { // Ignored for 0-component and 1-component operand types. + // For 4-component operand types, if the write mask is 0, it's treated as + // 0-component. uint32_t write_mask_; // Input destinations (v*) are for use only in declarations. Vector input @@ -1028,12 +1030,16 @@ struct Dest : OperandAddress { void Write(std::vector& code, bool in_dcl = false) const { uint32_t operand_token = GetOperandTokenTypeAndIndex(); OperandDimension dimension = GetDimension(in_dcl); - operand_token |= uint32_t(dimension); if (dimension == OperandDimension::kVector) { - assert_true(write_mask_ > 0b0000 && write_mask_ <= 0b1111); - operand_token |= - (uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4); + if (write_mask_) { + assert_true(write_mask_ <= 0b1111); + operand_token |= + (uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4); + } else { + dimension = OperandDimension::kNoData; + } } + operand_token |= uint32_t(dimension); code.push_back(operand_token); OperandAddress::Write(code); } @@ -1508,6 +1514,8 @@ enum class Opcode : uint32_t { kStoreUAVTyped = 164, kLdRaw = 165, kStoreRaw = 166, + kAtomicAnd = 169, + kAtomicOr = 170, kEvalSampleIndex = 204, kEvalCentroid = 205, }; @@ -2396,6 +2404,14 @@ class Assembler { ++stat_.instruction_count; ++stat_.c_texture_store_instructions; } + void OpAtomicAnd(const Dest& dest, const Src& address, + uint32_t address_components, const Src& value) { + EmitAtomicOp(Opcode::kAtomicAnd, dest, address, address_components, value); + } + void OpAtomicOr(const Dest& 
dest, const Src& address, + uint32_t address_components, const Src& value) { + EmitAtomicOp(Opcode::kAtomicOr, dest, address, address_components, value); + } void OpEvalSampleIndex(const Dest& dest, const Src& value, const Src& sample_index) { uint32_t dest_write_mask = dest.GetMask(); @@ -2522,6 +2538,22 @@ class Assembler { src1.Write(code_, true, 0b0000); ++stat_.instruction_count; } + void EmitAtomicOp(Opcode opcode, const Dest& dest, const Src& address, + uint32_t address_components, const Src& value) { + // Atomic operations require a 0-component memory destination. + assert_zero(dest.GetMask()); + uint32_t address_mask = (1 << address_components) - 1; + uint32_t operands_length = dest.GetLength() + + address.GetLength(address_mask) + + value.GetLength(0b0001); + code_.reserve(code_.size() + 1 + operands_length); + code_.push_back(OpcodeToken(opcode, operands_length)); + dest.Write(code_); + address.Write(code_, true, address_mask); + value.Write(code_, true, 0b0001); + ++stat_.instruction_count; + ++stat_.c_interlocked_instructions; + } std::vector& code_; Statistics& stat_; diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 921d7e346..5edf920b8 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -179,8 +179,6 @@ void DxbcShaderTranslator::Reset() { sampler_bindings_.clear(); - memexport_alloc_current_count_ = 0; - std::memset(&shader_feature_info_, 0, sizeof(shader_feature_info_)); std::memset(&statistics_, 0, sizeof(statistics_)); } @@ -789,6 +787,63 @@ void DxbcShaderTranslator::StartPixelShader() { PopSystemTemp(); } } + + if (current_shader().memexport_eM_written()) { + // Make sure memexport is done only once for a guest pixel. 
+ dxbc::Dest memexport_enabled_dest( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001)); + dxbc::Src memexport_enabled_src(dxbc::Src::R( + system_temp_memexport_enabled_and_eM_written_, dxbc::Src::kXXXX)); + uint32_t resolution_scaled_axes = + uint32_t(draw_resolution_scale_x_ > 1) | + (uint32_t(draw_resolution_scale_y_ > 1) << 1); + if (resolution_scaled_axes) { + uint32_t memexport_condition_temp = PushSystemTemp(); + // Only do memexport for one host pixel in a guest pixel - prefer the + // host pixel closer to the center of the guest pixel, but one that's + // covered with the half-pixel offset according to the top-left rule (1 + // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x + // because it's the center and is covered with the half-pixel offset too). + in_position_used_ |= resolution_scaled_axes; + a_.OpFToU(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::V1D(in_reg_ps_position_)); + a_.OpUDiv(dxbc::Dest::Null(), + dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::R(memexport_condition_temp), + dxbc::Src::LU(draw_resolution_scale_x_, + draw_resolution_scale_y_, 0, 0)); + a_.OpIEq(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::R(memexport_condition_temp), + dxbc::Src::LU(draw_resolution_scale_x_ >> 1, + draw_resolution_scale_y_ >> 1, 0, 0)); + for (uint32_t i = 0; i < 2; ++i) { + if (!(resolution_scaled_axes & (1 << i))) { + continue; + } + a_.OpAnd(memexport_enabled_dest, memexport_enabled_src, + dxbc::Src::R(memexport_condition_temp).Select(i)); + } + // Release memexport_condition_temp. + PopSystemTemp(); + } + // With sample-rate shading (with float24 conversion), only do memexport + // from one sample (as the shader is invoked multiple times for a pixel), + // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage, + // firstbit_lo returns 0xFFFFFFFF. 
+ if (IsSampleRate()) { + uint32_t memexport_condition_temp = PushSystemTemp(); + a_.OpFirstBitLo(dxbc::Dest::R(memexport_condition_temp, 0b0001), + dxbc::Src::VCoverage()); + a_.OpIEq( + dxbc::Dest::R(memexport_condition_temp, 0b0001), + dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY), + dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX)); + a_.OpAnd(memexport_enabled_dest, memexport_enabled_src, + dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX)); + // Release memexport_condition_temp. + PopSystemTemp(); + } + } } void DxbcShaderTranslator::StartTranslation() { @@ -885,34 +940,27 @@ void DxbcShaderTranslator::StartTranslation() { } } - if (!is_depth_only_pixel_shader_) { - // Allocate temporary registers for memexport addresses and data. - std::memset(system_temps_memexport_address_, 0xFF, - sizeof(system_temps_memexport_address_)); - std::memset(system_temps_memexport_data_, 0xFF, - sizeof(system_temps_memexport_data_)); - system_temp_memexport_written_ = UINT32_MAX; - const uint8_t* memexports_written = current_shader().memexport_eM_written(); - for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { - uint32_t memexport_alloc_written = memexports_written[i]; - if (memexport_alloc_written == 0) { - continue; - } - // If memexport is used at all, allocate a register containing whether eM# - // have actually been written to. - if (system_temp_memexport_written_ == UINT32_MAX) { - system_temp_memexport_written_ = PushSystemTemp(0b1111); - } - system_temps_memexport_address_[i] = PushSystemTemp(0b1111); - uint32_t memexport_data_index; - while (xe::bit_scan_forward(memexport_alloc_written, - &memexport_data_index)) { - memexport_alloc_written &= ~(1u << memexport_data_index); - system_temps_memexport_data_[i][memexport_data_index] = - PushSystemTemp(); - } + // Allocate temporary registers for memexport. 
+ uint8_t memexport_eM_written = current_shader().memexport_eM_written(); + if (memexport_eM_written) { + system_temp_memexport_enabled_and_eM_written_ = PushSystemTemp(0b0010); + // Initialize the memexport conditional to whether the shared memory is + // currently bound as UAV (to 0 or UINT32_MAX). It can be made narrower + // later. + a_.OpIBFE( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001), + dxbc::Src::LU(1), dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift), + LoadFlagsSystemConstant()); + system_temp_memexport_address_ = PushSystemTemp(0b1111); + uint8_t memexport_eM_remaining = memexport_eM_written; + uint32_t memexport_eM_index; + while (xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) { + memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index); + system_temps_memexport_data_[memexport_eM_index] = PushSystemTemp(0b1111); } + } + if (!is_depth_only_pixel_shader_) { // Allocate system temporary variables for the translated code. Since access // depends on the guest code (thus no guarantees), initialize everything // now (except for pv, it's an internal temporary variable, not accessible @@ -1091,27 +1139,19 @@ void DxbcShaderTranslator::CompleteShaderCode() { // - system_temp_grad_h_lod_. // - system_temp_grad_v_vfetch_address_. PopSystemTemp(6); + } - // Write memexported data to the shared memory UAV. - ExportToMemory(); + uint8_t memexport_eM_written = current_shader().memexport_eM_written(); + if (memexport_eM_written) { + // Write data for the last memexport. + ExportToMemory( + current_shader().memexport_eM_potentially_written_before_end()); - // Release memexport temporary registers. - for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) { - if (system_temps_memexport_address_[i] == UINT32_MAX) { - continue; - } - // Release exported data registers. - for (int j = 4; j >= 0; --j) { - if (system_temps_memexport_data_[i][j] != UINT32_MAX) { - PopSystemTemp(); - } - } - // Release the address register. 
- PopSystemTemp(); - } - if (system_temp_memexport_written_ != UINT32_MAX) { - PopSystemTemp(); - } + // Release memexport temporary registers: + // - system_temp_memexport_enabled_and_eM_written_. + // - system_temp_memexport_address_. + // - system_temps_memexport_data_. + PopSystemTemp(xe::bit_count(uint32_t(memexport_eM_written)) + 2); } // Write stage-specific epilogue. @@ -1514,36 +1554,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, dest = dxbc::Dest::R(system_temp_point_size_edge_flag_kill_vertex_); break; case InstructionStorageTarget::kExportAddress: - // Validate memexport writes (4D5307E6 has some completely invalid ones). - if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > Shader::kMaxMemExports || - system_temps_memexport_address_[memexport_alloc_current_count_ - 1] == - UINT32_MAX) { + if (!current_shader().memexport_eM_written()) { return; } - dest = dxbc::Dest::R( - system_temps_memexport_address_[memexport_alloc_current_count_ - 1]); + dest = dxbc::Dest::R(system_temp_memexport_address_); break; case InstructionStorageTarget::kExportData: { - // Validate memexport writes (4D5307E6 has some completely invalid ones). - if (memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > Shader::kMaxMemExports || - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [result.storage_index] == UINT32_MAX) { - return; - } - dest = dxbc::Dest::R( - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [result.storage_index]); + assert_not_zero(current_shader().memexport_eM_written() & + (uint8_t(1) << result.storage_index)); + dest = dxbc::Dest::R(system_temps_memexport_data_[result.storage_index]); // Mark that the eM# has been written to and needs to be exported. 
assert_not_zero(used_write_mask); - uint32_t memexport_index = memexport_alloc_current_count_ - 1; - a_.OpOr(dxbc::Dest::R(system_temp_memexport_written_, - 1 << (memexport_index >> 2)), - dxbc::Src::R(system_temp_memexport_written_) - .Select(memexport_index >> 2), - dxbc::Src::LU(uint32_t(1) << (result.storage_index + - ((memexport_index & 3) << 3)))); + a_.OpOr( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY), + dxbc::Src::LU(uint8_t(1) << result.storage_index)); } break; case InstructionStorageTarget::kColor: assert_not_zero(used_write_mask); @@ -1990,15 +2016,38 @@ void DxbcShaderTranslator::ProcessJumpInstruction( } void DxbcShaderTranslator::ProcessAllocInstruction( - const ParsedAllocInstruction& instr) { + const ParsedAllocInstruction& instr, uint8_t export_eM) { + bool start_memexport = instr.type == AllocType::kMemory && + current_shader().memexport_eM_written(); + if (export_eM || start_memexport) { + CloseExecConditionals(); + } + if (emit_source_map_) { instruction_disassembly_buffer_.Reset(); instr.Disassemble(&instruction_disassembly_buffer_); EmitInstructionDisassembly(); } - if (instr.type == AllocType::kMemory) { - ++memexport_alloc_current_count_; + if (export_eM) { + ExportToMemory(export_eM); + // Reset which eM# elements have been written. + a_.OpMov( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010), + dxbc::Src::LU(0)); + // Break dependencies from the previous memexport. + uint8_t export_eM_remaining = export_eM; + uint32_t eM_index; + while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) { + export_eM_remaining &= ~(uint8_t(1) << eM_index); + a_.OpMov(dxbc::Dest::R(system_temps_memexport_data_[eM_index]), + dxbc::Src::LF(0.0f)); + } + } + + if (start_memexport) { + // Initialize eA to an invalid address. 
+ a_.OpMov(dxbc::Dest::R(system_temp_memexport_address_), dxbc::Src::LU(0)); } } @@ -2851,7 +2900,7 @@ void DxbcShaderTranslator::WriteInputSignature() { // Sample index (SV_SampleIndex) for safe memexport with sample-rate // shading. size_t sample_index_position = SIZE_MAX; - if (current_shader().is_valid_memexport_used() && IsSampleRate()) { + if (current_shader().memexport_eM_written() && IsSampleRate()) { size_t sample_index_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + kParameterDwords); ++parameter_count; @@ -3625,7 +3674,7 @@ void DxbcShaderTranslator::WriteShaderCode() { dxbc::Name::kPosition); } bool sample_rate_memexport = - current_shader().is_valid_memexport_used() && IsSampleRate(); + current_shader().memexport_eM_written() && IsSampleRate(); // Sample-rate shading can't be done with UAV-only rendering (sample-rate // shading is only needed for float24 depth conversion when using a float32 // host depth buffer). diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index bcb38a21f..20fbdd328 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -20,6 +20,7 @@ #include "xenia/base/string_buffer.h" #include "xenia/gpu/dxbc.h" #include "xenia/gpu/shader_translator.h" +#include "xenia/gpu/ucode.h" #include "xenia/ui/graphics_provider.h" namespace xe { @@ -589,13 +590,16 @@ class DxbcShaderTranslator : public ShaderTranslator { void ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) override; void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override; - void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override; + void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) override; void ProcessVertexFetchInstruction( const ParsedVertexFetchInstruction& instr) override; void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) override; - void 
ProcessAluInstruction(const ParsedAluInstruction& instr) override; + void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) override; private: // IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature @@ -674,6 +678,11 @@ class DxbcShaderTranslator : public ShaderTranslator { // Frees the last allocated internal r# registers for later reuse. void PopSystemTemp(uint32_t count = 1); + // ExportToMemory modifies the values of eA/eM# for simplicity, call only + // before starting a new export or ending the invocation or making it + // inactive. + void ExportToMemory(uint8_t export_eM); + // Converts one scalar from piecewise linear gamma to linear. The target may // be the same as the source, the temporary variables must be different. If // the source is not pre-saturated, saturation will be done internally. @@ -728,7 +737,7 @@ class DxbcShaderTranslator : public ShaderTranslator { bool ROV_IsDepthStencilEarly() const { assert_true(edram_rov_used_); return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() && - !current_shader().is_valid_memexport_used(); + !current_shader().memexport_eM_written(); } // Converts the pre-clamped depth value to 24-bit (storing the result in bits // 0:23 and zeros in 24:31, not creating room for stencil - since this may be @@ -787,14 +796,6 @@ class DxbcShaderTranslator : public ShaderTranslator { void StartPixelShader_LoadROVParameters(); void StartPixelShader(); - // Writing the epilogue. - // ExportToMemory modifies the values of eA/eM# for simplicity, don't call - // multiple times. 
- void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count, - const uint32_t bits[4], - const dxbc::Src& is_integer, - const dxbc::Src& is_signed); - void ExportToMemory(); void CompleteVertexOrDomainShader(); // For RTV, adds the sample to coverage_temp.coverage_temp_component if it // passes alpha to mask (or, if initialize == true (for the first sample @@ -917,13 +918,16 @@ class DxbcShaderTranslator : public ShaderTranslator { .SelectFromSwizzled(word_index & 1); } - void KillPixel(bool condition, const dxbc::Src& condition_src); + void KillPixel(bool condition, const dxbc::Src& condition_src, + uint8_t memexport_eM_potentially_written_before); - void ProcessVectorAluOperation(const ParsedAluInstruction& instr, - uint32_t& result_swizzle, - bool& predicate_written); - void ProcessScalarAluOperation(const ParsedAluInstruction& instr, - bool& predicate_written); + void ProcessVectorAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle, + bool& predicate_written); + void ProcessScalarAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written); void WriteResourceDefinition(); void WriteInputSignature(); @@ -1124,14 +1128,16 @@ class DxbcShaderTranslator : public ShaderTranslator { // writing). uint32_t system_temps_color_[4]; - // Bits containing whether each eM# has been written, for up to 16 streams, or - // UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with - // 4 `alloc export`s per component. - uint32_t system_temp_memexport_written_; - // eA in each `alloc export`, or UINT32_MAX if not used. - uint32_t system_temps_memexport_address_[Shader::kMaxMemExports]; - // eM# in each `alloc export`, or UINT32_MAX if not used. 
- uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5]; + // Memory export temporary registers are allocated if the shader writes any + // eM# (current_shader().memexport_eM_written() != 0). + // X - whether memexport is enabled for this invocation. + // Y - which eM# elements have been written so far by the invocation since the + // last memory write. + uint32_t system_temp_memexport_enabled_and_eM_written_; + // eA. + uint32_t system_temp_memexport_address_; + // eM#. + uint32_t system_temps_memexport_data_[ucode::kMaxMemExportElementCount]; // Vector ALU or fetch result / scratch (since Xenos write masks can contain // swizzles). @@ -1195,10 +1201,6 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t uav_index_edram_; std::vector sampler_bindings_; - - // Number of `alloc export`s encountered so far in the translation. The index - // of the current eA/eM# temp register set is this minus 1, if it's not 0. - uint32_t memexport_alloc_current_count_; }; } // namespace gpu diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 948406b90..a1d2970f0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -19,22 +19,29 @@ namespace xe { namespace gpu { using namespace ucode; -void DxbcShaderTranslator::KillPixel(bool condition, - const dxbc::Src& condition_src) { +void DxbcShaderTranslator::KillPixel( + bool condition, const dxbc::Src& condition_src, + uint8_t memexport_eM_potentially_written_before) { + a_.OpIf(condition, condition_src); + // Perform outstanding memory exports before the invocation becomes inactive + // and UAV writes are disabled. + ExportToMemory(memexport_eM_potentially_written_before); // Discard the pixel, but continue execution if other lanes in the quad need // this lane for derivatives. The driver may also perform early exiting // internally if all lanes are discarded if deemed beneficial. 
- a_.OpDiscard(condition, condition_src); + a_.OpDiscard(true, dxbc::Src::LU(UINT32_MAX)); if (edram_rov_used_) { // Even though discarding disables all subsequent UAV/ROV writes, also skip // as much of the Render Backend emulation logic as possible by setting the // coverage and the mask of the written render targets to zero. a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0)); } + a_.OpEndIf(); } void DxbcShaderTranslator::ProcessVectorAluOperation( - const ParsedAluInstruction& instr, uint32_t& result_swizzle, + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle, bool& predicate_written) { result_swizzle = dxbc::Src::kXYZW; predicate_written = false; @@ -506,7 +513,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -522,7 +530,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -538,7 +547,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), 
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -554,7 +564,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -640,7 +651,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( } void DxbcShaderTranslator::ProcessScalarAluOperation( - const ParsedAluInstruction& instr, bool& predicate_written) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written) { predicate_written = false; if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev) { @@ -950,27 +962,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kKillsEq: a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsGt: a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsGe: a_.OpGE(ps_dest, operand_0_a, 
dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsNe: a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsOne: a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; @@ -1024,7 +1036,8 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( } void DxbcShaderTranslator::ProcessAluInstruction( - const ParsedAluInstruction& instr) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) { if (instr.IsNop()) { // Don't even disassemble or update predication. return; @@ -1041,10 +1054,11 @@ void DxbcShaderTranslator::ProcessAluInstruction( // checked again later. 
bool predicate_written_vector = false; uint32_t vector_result_swizzle = dxbc::Src::kXYZW; - ProcessVectorAluOperation(instr, vector_result_swizzle, - predicate_written_vector); + ProcessVectorAluOperation(instr, memexport_eM_potentially_written_before, + vector_result_swizzle, predicate_written_vector); bool predicate_written_scalar = false; - ProcessScalarAluOperation(instr, predicate_written_scalar); + ProcessScalarAluOperation(instr, memexport_eM_potentially_written_before, + predicate_written_scalar); StoreResult(instr.vector_and_constant_result, dxbc::Src::R(system_temp_result_, vector_result_swizzle), diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index c48facc08..1049fa739 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -2,533 +2,830 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2023 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ +#include +#include +#include + #include "xenia/base/assert.h" #include "xenia/base/math.h" -#include "xenia/gpu/draw_util.h" #include "xenia/gpu/dxbc_shader_translator.h" -#include "xenia/gpu/texture_cache.h" namespace xe { namespace gpu { using namespace ucode; -// TODO(Triang3l): Support sub-dword memexports (like k_8 in 58410B86). This -// would require four 128 MB R8_UINT UAVs due to -// D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP. Need to be careful with -// resource binding tiers, however. Resource binding tier 1 on feature level -// 11_0 allows only 8 UAVs _across all stages_. 
RWByteAddressBuffer + 4 typed -// buffers is 5 per stage already, would need 10 for both VS and PS, or even 11 -// with the eDRAM ROV. Need to drop draw commands doing memexport in both VS and -// PS on FL 11_0 resource binding tier 1. - -void DxbcShaderTranslator::ExportToMemory_PackFixed32( - const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4], - const dxbc::Src& is_integer, const dxbc::Src& is_signed) { - // Will insert with BFI - sign extension of red will be overwritten, not - // truncated. - assert_not_zero(bits[0]); - assert_true(bits[0] + bits[1] + bits[2] + bits[3] == 32); - uint32_t mask = 0; - for (uint32_t i = 0; i < 4; ++i) { - if (bits[i]) { - mask |= 1 << i; - } - } - a_.OpIf(true, is_signed); - { - float range[4]; - for (uint32_t i = 0; i < 4; ++i) { - range[i] = bits[i] ? float((uint32_t(1) << (bits[i] - 1)) - 1) : 0.0f; - } - dxbc::Src range_src(dxbc::Src::LP(range)); - a_.OpIf(false, is_integer); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src); - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask)); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[i])); - // TODO(Triang3l): NaN should become zero, not -range. 
- a_.OpMax(eM_dest, eM_src, -range_src); - a_.OpMin(eM_dest, eM_src, range_src); - } - } - a_.OpElse(); - { - float range[4]; - for (uint32_t i = 0; i < 4; ++i) { - range[i] = float((uint32_t(1) << bits[i]) - 1); - } - dxbc::Src range_src(dxbc::Src::LP(range)); - a_.OpIf(false, is_integer); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src); - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask)); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[i])); - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f)); - a_.OpMin(eM_dest, eM_src, range_src); - } - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - // Round to the nearest integer, according to the rules of handling integer - // formats in Direct3D. - // TODO(Triang3l): Round by adding +-0.5, not with round_ne. - a_.OpRoundNE(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp)); - a_.OpFToI(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp)); - dxbc::Dest eM_packed_dest(dxbc::Dest::R(eM_temp, 0b0001)); - dxbc::Src eM_packed_src(dxbc::Src::R(eM_temp, dxbc::Src::kXXXX)); - uint32_t offset = bits[0]; - for (uint32_t j = 1; j < 4; ++j) { - if (!bits[j]) { - continue; - } - a_.OpBFI(eM_packed_dest, dxbc::Src::LU(bits[j]), dxbc::Src::LU(offset), - dxbc::Src::R(eM_temp).Select(j), eM_packed_src); - offset += bits[j]; - } - } -} - -void DxbcShaderTranslator::ExportToMemory() { - if (system_temp_memexport_written_ == UINT32_MAX) { - // No exports in the shader. +void DxbcShaderTranslator::ExportToMemory(uint8_t export_eM) { + if (!export_eM) { return; } - // Allocate a register for temporary values at various stages. - uint32_t control_temp = PushSystemTemp(); + assert_zero(export_eM & ~current_shader().memexport_eM_written()); - // Safety check if the shared memory is bound as UAV. 
- a_.OpUBFE(dxbc::Dest::R(control_temp, 0b0001), dxbc::Src::LU(1), - dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift), - LoadFlagsSystemConstant()); - // Open the `if` with the uniform condition for the shared memory buffer being - // bound as a UAV (more fine-grained checks are vector and likely divergent). - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); + // Check if memory export is allowed in this invocation. + a_.OpIf(true, dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kXXXX)); - // Check more fine-grained limitations. - bool inner_condition_provided = false; - if (is_pixel_shader()) { - uint32_t resolution_scaled_axes = - uint32_t(draw_resolution_scale_x_ > 1) | - (uint32_t(draw_resolution_scale_y_ > 1) << 1); - if (resolution_scaled_axes) { - // Only do memexport for one host pixel in a guest pixel - prefer the - // host pixel closer to the center of the guest pixel, but one that's - // covered with the half-pixel offset according to the top-left rule (1 - // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x - // because it's the center and is covered with the half-pixel offset too). - // Using control_temp.yz as per-axis temporary variables. - in_position_used_ |= resolution_scaled_axes; - a_.OpFToU(dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), - dxbc::Src::V1D(in_reg_ps_position_, 0b0100 << 2)); - a_.OpUDiv(dxbc::Dest::Null(), - dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), - dxbc::Src::R(control_temp, 0b1001 << 2), - dxbc::Src::LU(0, draw_resolution_scale_x_, - draw_resolution_scale_y_, 0)); - for (uint32_t i = 0; i < 2; ++i) { - if (!(resolution_scaled_axes & (1 << i))) { - continue; - } - // If there's no inner condition in control_temp.x yet, the condition - // for the current axis can go directly to it. Otherwise, need to merge - // with the previous condition, using control_temp.y or .z as an - // intermediate variable. 
- dxbc::Src resolution_scaled_axis_src( - dxbc::Src::R(control_temp).Select(1 + i)); - a_.OpIEq( - dxbc::Dest::R(control_temp, - inner_condition_provided ? 1 << (1 + i) : 0b0001), - resolution_scaled_axis_src, - dxbc::Src::LU( - (i ? draw_resolution_scale_y_ : draw_resolution_scale_x_) >> - 1)); - if (inner_condition_provided) { - // Merge with the previous condition in control_temp.x. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - resolution_scaled_axis_src); - } - inner_condition_provided = true; - } - } - // With sample-rate shading (with float24 conversion), only do memexport - // from one sample (as the shader is invoked multiple times for a pixel), - // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage, - // firstbit_lo returns 0xFFFFFFFF. - if (IsSampleRate()) { - a_.OpFirstBitLo(dxbc::Dest::R(control_temp, 0b0010), - dxbc::Src::VCoverage()); - a_.OpIEq( - dxbc::Dest::R(control_temp, - inner_condition_provided ? 0b0010 : 0b0001), - dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY), - dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - if (inner_condition_provided) { - // Merge with the previous condition in control_temp.x. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - } - inner_condition_provided = true; - } - } - // Open the inner (vector) conditional if needed. - if (inner_condition_provided) { - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); - } - // control_temp.x is now free. - - for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { - uint32_t eA_temp = system_temps_memexport_address_[i]; - if (eA_temp == UINT32_MAX) { - // Export not used. - continue; - } - // For simplicity of access, gather actually used eM# registers for this - // export. Zero-initialize eM_offsets because excess elements of it may be - // accessed, for stable caching. 
- uint32_t eM_temps[5], eM_offsets[5] = {}, eM_count = 0; - for (uint32_t j = 0; j < 5; ++j) { - uint32_t eM_temp = system_temps_memexport_data_[i][j]; - if (eM_temp == UINT32_MAX) { - continue; - } - eM_temps[eM_count] = eM_temp; - eM_offsets[eM_count] = j; - ++eM_count; - } - if (eM_count == 0) { - continue; - } - - // Swap red and blue if needed. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(uint32_t(1) << 19)); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMovC(dxbc::Dest::R(eM_temp, 0b0101), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - dxbc::Src::R(eM_temp, 0b000010), dxbc::Src::R(eM_temp)); - } - - // Initialize element size in control_temp.x to 4 bytes as this is the most - // common size. - dxbc::Dest element_size_dest(dxbc::Dest::R(control_temp, 0b0001)); - dxbc::Src element_size_src(dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); - a_.OpMov(element_size_dest, dxbc::Src::LU(4)); - - // Each eM should get a packed value in the destination format now. - - // Extract format properties to control_temp. - // Y - signedness if fixed-point. - // Z - fractional/integer if fixed-point. - // W - color format. - a_.OpUBFE(dxbc::Dest::R(control_temp, 0b1110), dxbc::Src::LU(0, 1, 1, 6), - dxbc::Src::LU(0, 16, 17, 8), - dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ)); - dxbc::Src is_signed(dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - dxbc::Src is_integer(dxbc::Src::R(control_temp, dxbc::Src::kZZZZ)); - // Convert and pack the format. - a_.OpSwitch(dxbc::Src::R(control_temp, dxbc::Src::kWWWW)); - // control_temp.w is now free. 
- { - // k_8_8_8_8 - // k_8_8_8_8_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16))); - { - uint32_t bits[4] = {8, 8, 8, 8}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_2_10_10_10 - // k_2_10_10_10_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16))); - { - uint32_t bits[4] = {10, 10, 10, 2}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_10_11_11 - // k_10_11_11_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16))); - { - uint32_t bits[4] = {11, 11, 10}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_11_11_10 - // k_11_11_10_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16))); - { - uint32_t bits[4] = {10, 11, 11}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16))); - { - uint32_t bits[4] = {16, 16}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - a_.OpIf(true, is_signed); - { - a_.OpIf(false, is_integer); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp), - dxbc::Src::LF(32767.0f)); - } - 
a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // TODO(Triang3l): NaN should become zero, not -range. - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(-32767.0f)); - a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(32767.0f)); - } - } - a_.OpElse(); - { - a_.OpIf(false, is_integer); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp), - dxbc::Src::LF(65535.0f)); - } - a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f)); - a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(65535.0f)); - } - } - a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - // Round to the nearest integer, according to the rules of handling - // integer formats in Direct3D. - // TODO(Triang3l): Round by adding +-0.5, not with round_ne. 
- a_.OpRoundNE(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpFToI(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101), - dxbc::Src::R(eM_temp, 0b1000)); - } - a_.OpBreak(); - - // k_16_16_FLOAT - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT))); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpF32ToF16(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0001), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, dxbc::Src::kYYYY), - dxbc::Src::R(eM_temp, dxbc::Src::kXXXX)); - } - a_.OpBreak(); - - // k_16_16_16_16_FLOAT - a_.OpCase( - dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpF32ToF16(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101), - dxbc::Src::R(eM_temp, 0b1000)); - } - a_.OpBreak(); - - // k_32_FLOAT - // Already in the destination format, 4 bytes per element already - // selected. - - // k_32_32_FLOAT - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - // Already in the destination format. - a_.OpBreak(); - - // k_32_32_32_32_FLOAT - a_.OpCase( - dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(16)); - // Already in the destination format. - a_.OpBreak(); - } - a_.OpEndSwitch(); - // control_temp.yz are now free. - - // Do endian swap. - { - dxbc::Dest endian_dest(dxbc::Dest::R(control_temp, 0b0010)); - dxbc::Src endian_src(dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - // Extract endianness into control_temp.y. 
- a_.OpAnd(endian_dest, dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(0b111)); - - // Change 8-in-64 and 8-in-128 to 8-in-32. - for (uint32_t j = 0; j < 2; ++j) { - a_.OpIEq(dxbc::Dest::R(control_temp, 0b0100), endian_src, - dxbc::Src::LU(uint32_t(j ? xenos::Endian128::k8in128 - : xenos::Endian128::k8in64))); - for (uint32_t k = 0; k < eM_count; ++k) { - uint32_t eM_temp = eM_temps[k]; - a_.OpMovC(dxbc::Dest::R(eM_temp), - dxbc::Src::R(control_temp, dxbc::Src::kZZZZ), - dxbc::Src::R(eM_temp, j ? 0b00011011 : 0b10110001), - dxbc::Src::R(eM_temp)); - } - a_.OpMovC(endian_dest, dxbc::Src::R(control_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)), - endian_src); - } - - uint32_t swap_temp = PushSystemTemp(); - dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp)); - dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp)); - - // 8-in-16 or one half of 8-in-32. - a_.OpSwitch(endian_src); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16))); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // Temp = X0Z0. - a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); - // eM = YZW0. - a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8)); - // eM = Y0W0. - a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); - // eM = YXWZ. - a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src); - } - a_.OpBreak(); - a_.OpEndSwitch(); - - // 16-in-32 or another half of 8-in-32. - a_.OpSwitch(endian_src); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32))); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // Temp = ZW00. - a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16)); - // eM = ZWXY. 
- a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src, - swap_temp_src); - } - a_.OpBreak(); - a_.OpEndSwitch(); - - // Release swap_temp. - PopSystemTemp(); - } - // control_temp.yz are now free. - - dxbc::Dest address_dest(dxbc::Dest::R(eA_temp, 0b0001)); - dxbc::Src address_src(dxbc::Src::R(eA_temp, dxbc::Src::kXXXX)); - // Multiply the base address by dword size, also dropping the 0x40000000 - // bit. - a_.OpIShL(address_dest, address_src, dxbc::Src::LU(2)); - // Drop the exponent in the element index. - a_.OpAnd(dxbc::Dest::R(eA_temp, 0b0010), - dxbc::Src::R(eA_temp, dxbc::Src::kYYYY), - dxbc::Src::LU((1 << 23) - 1)); - // Add the offset of the first written element to the base address. - a_.OpUMAd(address_dest, dxbc::Src::R(eA_temp, dxbc::Src::kYYYY), - element_size_src, address_src); - // Do the writes. - dxbc::Src eM_written_src( - dxbc::Src::R(system_temp_memexport_written_).Select(i >> 2)); - uint32_t eM_written_base = 1u << ((i & 3) << 3); - for (uint32_t j = 0; j < eM_count; ++j) { - // Go to the next eM#. - uint32_t eM_relative_offset = eM_offsets[j] - (j ? eM_offsets[j - 1] : 0); - if (eM_relative_offset) { - if (eM_relative_offset == 1) { - a_.OpIAdd(address_dest, element_size_src, address_src); - } else { - a_.OpUMAd(address_dest, dxbc::Src::LU(eM_relative_offset), - element_size_src, address_src); - } - } - // Check if the eM# was actually written to on the execution path. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0010), eM_written_src, - dxbc::Src::LU(eM_written_base << eM_offsets[j])); - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - // Write the element of the needed size. 
- dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - a_.OpSwitch(element_size_src); - for (uint32_t k = 1; k <= 4; k <<= 1) { - a_.OpCase(dxbc::Src::LU(k * 4)); - if (uav_index_shared_memory_ == kBindingIndexUnallocated) { - uav_index_shared_memory_ = uav_count_++; - } - a_.OpStoreRaw( - dxbc::Dest::U(uav_index_shared_memory_, - uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1), - address_src, eM_src); - a_.OpBreak(); - } - a_.OpEndSwitch(); - a_.OpEndIf(); - } - // control_temp.y is now free. + // Check if the address with the correct sign and exponent was written, and + // that the index doesn't overflow the mantissa bits. + { + uint32_t address_check_temp = PushSystemTemp(); + a_.OpUShR(dxbc::Dest::R(address_check_temp), + dxbc::Src::R(system_temp_memexport_address_), + dxbc::Src::LU(30, 23, 23, 23)); + a_.OpIEq(dxbc::Dest::R(address_check_temp), + dxbc::Src::R(address_check_temp), + dxbc::Src::LU(0x1, 0x96, 0x96, 0x96)); + a_.OpAnd(dxbc::Dest::R(address_check_temp, 0b0011), + dxbc::Src::R(address_check_temp), + dxbc::Src::R(address_check_temp, 0b1110)); + a_.OpAnd(dxbc::Dest::R(address_check_temp, 0b0001), + dxbc::Src::R(address_check_temp, dxbc::Src::kXXXX), + dxbc::Src::R(address_check_temp, dxbc::Src::kYYYY)); + a_.OpIf(true, dxbc::Src::R(address_check_temp, dxbc::Src::kXXXX)); + // Release address_check_temp. + PopSystemTemp(); } - // Close the inner memexport possibility conditional. - if (inner_condition_provided) { + uint8_t eM_remaining; + uint32_t eM_index; + + // Swap red and blue components if needed. + { + uint32_t red_blue_swap_temp = PushSystemTemp(); + a_.OpIBFE(dxbc::Dest::R(red_blue_swap_temp, 0b0001), dxbc::Src::LU(1), + dxbc::Src::LU(19), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + a_.OpIf(true, dxbc::Src::R(red_blue_swap_temp, dxbc::Src::kXXXX)); + // Release red_blue_swap_temp. 
+ PopSystemTemp(); + + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + a_.OpMov( + dxbc::Dest::R(system_temps_memexport_data_[eM_index], 0b0101), + dxbc::Src::R(system_temps_memexport_data_[eM_index], 0b11000110)); + } + + // Close the red/blue swap conditional. a_.OpEndIf(); } - // Close the outer memexport possibility conditional. + uint32_t temp = PushSystemTemp(); + + // Extract the color format and the numeric format. + // temp.x = color format. + // temp.y = numeric format is signed. + // temp.z = numeric format is integer. + a_.OpUBFE(dxbc::Dest::R(temp, 0b0111), dxbc::Src::LU(6, 1, 1, 0), + dxbc::Src::LU(8, 16, 17, 0), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + + // Perform format packing. + // After the switch, temp.x must contain log2 of the number of bytes in an + // element, of UINT32_MAX if the format is unknown. + a_.OpSwitch(dxbc::Src::R(temp, dxbc::Src::kXXXX)); + { + dxbc::Dest element_size_dest(dxbc::Dest::R(temp, 0b0001)); + dxbc::Src num_format_signed(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + dxbc::Src num_format_integer(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + auto flush_nan = [this, export_eM](uint32_t components) { + uint8_t eM_remaining = export_eM; + uint32_t eM_index; + uint32_t is_nan_temp = PushSystemTemp(); + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpNE(dxbc::Dest::R(is_nan_temp, components), dxbc::Src::R(eM), + dxbc::Src::R(eM)); + a_.OpMovC(dxbc::Dest::R(eM, components), dxbc::Src::R(is_nan_temp), + dxbc::Src::LF(0.0f), dxbc::Src::R(eM)); + } + // Release is_nan_temp. + PopSystemTemp(); + }; + + // The result will be in eM#.x. 
The widths must be without holes (R, RG, + // RGB, RGBA), and expecting the widths to add up to the size of the stored + // texel (8, 16 or 32 bits), as the unused upper bits will contain junk from + // the sign extension of X if the number is signed. + auto pack_8_16_32 = [&](std::array widths) { + uint8_t eM_remaining; + uint32_t eM_index; + + uint32_t components = 0; + std::array offsets = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i]) { + // Only formats for which max + 0.5 can be represented exactly. + assert(widths[i] <= 23); + components |= uint32_t(1) << i; + } + if (i) { + offsets[i] = offsets[i - 1] + widths[i - 1]; + } + } + // Will be packing components into eM#.x starting from green, assume red + // will already be there after the conversion. + assert_not_zero(components & 0b1); + + flush_nan(components); + + a_.OpIf(true, num_format_signed); + { + // Signed. + a_.OpIf(true, num_format_integer); + { + // Signed integer. + float min_value[4] = {}, max_value[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i]) { + max_value[i] = float((uint32_t(1) << (widths[i] - 1)) - 1); + min_value[i] = -1.0f - max_value[i]; + } + } + dxbc::Src min_value_src(dxbc::Src::LP(min_value)); + dxbc::Src max_value_src(dxbc::Src::LP(max_value)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), min_value_src, + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), max_value_src, + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Signed normalized. 
+ uint32_t scale_components = 0; + float scale[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i] > 2) { + scale_components |= uint32_t(1) << i; + scale[i] = float((uint32_t(1) << (widths[i] - 1)) - 1); + } + } + dxbc::Src scale_src(dxbc::Src::LP(scale)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), dxbc::Src::LF(-1.0f), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), dxbc::Src::LF(1.0f), + dxbc::Src::R(eM)); + if (scale_components) { + a_.OpMul(dxbc::Dest::R(eM, scale_components), dxbc::Src::R(eM), + scale_src); + } + } + } + a_.OpEndIf(); + + // Add plus/minus 0.5 before truncating according to the Direct3D format + // conversion rules, and convert to signed integers. + uint32_t round_bias_temp = PushSystemTemp(); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, components), dxbc::Src::LU(31), + dxbc::Src::LU(0), dxbc::Src::LF(0.5f), dxbc::Src::R(eM)); + a_.OpAdd(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), + dxbc::Src::R(round_bias_temp)); + a_.OpFToI(dxbc::Dest::R(eM, components), dxbc::Src::R(eM)); + } + // Release round_bias_temp. + PopSystemTemp(); + } + a_.OpElse(); + { + // Unsigned. + a_.OpIf(true, num_format_integer); + { + // Unsigned integer. 
+ float max_value[4]; + for (uint32_t i = 0; i < 4; ++i) { + max_value[i] = float((uint32_t(1) << widths[i]) - 1); + } + dxbc::Src max_value_src(dxbc::Src::LP(max_value)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), dxbc::Src::LF(0.0f), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), max_value_src, + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Unsigned normalized. + uint32_t scale_components = 0; + float scale[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i] > 1) { + scale_components |= uint32_t(1) << i; + scale[i] = float((uint32_t(1) << widths[i]) - 1); + } + } + dxbc::Src scale_src(dxbc::Src::LP(scale)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + // Saturate. + a_.OpMov(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), true); + if (scale_components) { + a_.OpMul(dxbc::Dest::R(eM, scale_components), dxbc::Src::R(eM), + scale_src); + } + } + } + a_.OpEndIf(); + + // Add 0.5 before truncating according to the Direct3D format conversion + // rules, and convert to unsigned integers. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpAdd(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), + dxbc::Src::LF(0.5f)); + a_.OpFToU(dxbc::Dest::R(eM, components), dxbc::Src::R(eM)); + } + } + a_.OpEndIf(); + + // Pack into 32 bits. 
+ for (uint32_t i = 0; i < 4; ++i) { + if (!widths[i]) { + continue; + } + dxbc::Src width_src(dxbc::Src::LU(widths[i])); + dxbc::Src offset_src(dxbc::Src::LU(offsets[i])); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, 0b0001), width_src, offset_src, + dxbc::Src::R(eM).Select(i), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + } + }; + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8))); + // TODO(Triang3l): Investigate how input should be treated for k_8_A, k_8_B, + // k_8_8_8_8_A. + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_A))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_B))); + { + pack_8_16_32({8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(0)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_1_5_5_5))); + { + pack_8_16_32({5, 5, 5, 1}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_5_6_5))); + { + pack_8_16_32({5, 6, 5}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_6_5_5))); + { + pack_8_16_32({5, 5, 6}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_A))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16))); + { + pack_8_16_32({8, 8, 8, 8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10))); + a_.OpCase(dxbc::Src::LU( + uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16))); + { + pack_8_16_32({10, 10, 10, 2}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + 
a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8))); + { + pack_8_16_32({8, 8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_4_4_4_4))); + { + pack_8_16_32({4, 4, 4, 4}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16))); + { + pack_8_16_32({11, 11, 10}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16))); + { + pack_8_16_32({10, 11, 11}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16))); + { + pack_8_16_32({16}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16))); + { + pack_8_16_32({16, 16}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16))); + { + flush_nan(0b1111); + + a_.OpIf(true, num_format_signed); + { + // Signed. + a_.OpIf(true, num_format_integer); + { + // Signed integer. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(float(INT16_MIN)), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(float(INT16_MAX)), + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Signed normalized. 
+ eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(-1.0f), dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(1.0f), dxbc::Src::R(eM)); + a_.OpMul(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::LF(float(INT16_MAX))); + } + } + a_.OpEndIf(); + + // Add plus/minus 0.5 before truncating according to the Direct3D format + // conversion rules, and convert to signed integers. + uint32_t round_bias_temp = PushSystemTemp(); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM), dxbc::Src::LU(31), dxbc::Src::LU(0), + dxbc::Src::LF(0.5f), dxbc::Src::R(eM)); + a_.OpAdd(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::R(round_bias_temp)); + a_.OpFToI(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + } + // Release round_bias_temp. + PopSystemTemp(); + } + a_.OpElse(); + { + // Unsigned. + a_.OpIf(true, num_format_integer); + { + // Unsigned integer. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(0.0f), dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(float(UINT16_MAX)), + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Unsigned normalized. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + // Saturate. 
+ a_.OpMov(dxbc::Dest::R(eM), dxbc::Src::R(eM), true); + a_.OpMul(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::LF(float(UINT16_MAX))); + } + } + a_.OpEndIf(); + + // Add 0.5 before truncating according to the Direct3D format conversion + // rules, and convert to unsigned integers. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpAdd(dxbc::Dest::R(eM), dxbc::Src::R(eM), dxbc::Src::LF(0.5f)); + a_.OpFToU(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + } + } + a_.OpEndIf(); + + // Pack. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, 0b0011), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, 0b1101), + dxbc::Src::R(eM, 0b1000)); + } + + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM, 0b0001), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. 
+ eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM, 0b0011), dxbc::Src::R(eM)); + a_.OpBFI(dxbc::Dest::R(eM, 0b0001), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, dxbc::Src::kYYYY), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + a_.OpBFI(dxbc::Dest::R(eM, 0b0011), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, 0b1101), + dxbc::Src::R(eM, 0b1000)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(4)); + } + a_.OpBreak(); + + a_.OpDefault(); + a_.OpMov(element_size_dest, dxbc::Src::LU(UINT32_MAX)); + a_.OpBreak(); + } + // Close the color format switch. + a_.OpEndSwitch(); + + dxbc::Src element_size_src(dxbc::Src::R(temp, dxbc::Src::kXXXX)); + + // Only temp.x is used currently (for the element size log2). + + // Do endian swap, using temp.y for the endianness value, and temp.z as a + // temporary value. 
+ { + dxbc::Dest endian_dest(dxbc::Dest::R(temp, 0b0010)); + dxbc::Src endian_src(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + // Extract endianness into temp.y. + a_.OpUBFE(endian_dest, dxbc::Src::LU(3), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + + // Change 8-in-64 and 8-in-128 to 8-in-32. + for (uint32_t i = 0; i < 2; ++i) { + a_.OpIEq(dxbc::Dest::R(temp, 0b0100), endian_src, + dxbc::Src::LU(uint32_t(i ? xenos::Endian128::k8in128 + : xenos::Endian128::k8in64))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMovC(dxbc::Dest::R(eM), dxbc::Src::R(temp, dxbc::Src::kZZZZ), + dxbc::Src::R(eM, i ? 0b00011011 : 0b10110001), + dxbc::Src::R(eM)); + } + a_.OpMovC(endian_dest, dxbc::Src::R(temp, dxbc::Src::kZZZZ), + dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)), endian_src); + } + + uint32_t swap_temp = PushSystemTemp(); + dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp)); + dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp)); + + // 8-in-16 or one half of 8-in-32. + a_.OpSwitch(endian_src); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + dxbc::Dest eM_dest(dxbc::Dest::R(eM)); + dxbc::Src eM_src(dxbc::Src::R(eM)); + // Temp = X0Z0. + a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); + // eM = YZW0. + a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8)); + // eM = Y0W0. + a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); + // eM = YXWZ. + a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src); + } + a_.OpBreak(); + a_.OpEndSwitch(); + + // 16-in-32 or another half of 8-in-32. 
+ a_.OpSwitch(endian_src); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + dxbc::Dest eM_dest(dxbc::Dest::R(eM)); + dxbc::Src eM_src(dxbc::Src::R(eM)); + // Temp = ZW00. + a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16)); + // eM = ZWXY. + a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src, + swap_temp_src); + } + a_.OpBreak(); + a_.OpEndSwitch(); + + // Release swap_temp. + PopSystemTemp(); + } + + // Extract the base index to temp.y and the index upper bound to temp.z. + a_.OpUBFE(dxbc::Dest::R(temp, 0b0110), dxbc::Src::LU(23), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_address_, 0b1101 << 2)); + dxbc::Dest eM0_address_dest(dxbc::Dest::R(temp, 0b0010)); + dxbc::Src eM0_address_src(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + dxbc::Src index_count_src(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + // Check if eM0 isn't out of bounds via temp.w - if it is, eM1...4 also are + // (the base index can't be negative). + a_.OpILT(dxbc::Dest::R(temp, 0b1000), eM0_address_src, index_count_src); + a_.OpIf(true, dxbc::Src::R(temp, dxbc::Src::kWWWW)); + + // Extract the base address to temp.w as bytes (30 lower bits to 30 upper bits + // with 0 below). + a_.OpIShL(dxbc::Dest::R(temp, 0b1000), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kXXXX), + dxbc::Src::LU(2)); + dxbc::Src base_address_src(dxbc::Src::R(temp, dxbc::Src::kWWWW)); + + uint8_t export_eM14 = export_eM >> 1; + assert_zero(export_eM14 >> 4); + uint32_t eM14_address_temp = UINT32_MAX, store_eM14_temp = UINT32_MAX; + if (export_eM14) { + // Get eM1...4 indices and check if they're in bounds. 
+ eM14_address_temp = PushSystemTemp(); + dxbc::Dest eM14_address_dest(dxbc::Dest::R(eM14_address_temp, export_eM14)); + dxbc::Src eM14_address_src(dxbc::Src::R(eM14_address_temp)); + store_eM14_temp = PushSystemTemp(); + dxbc::Dest store_eM14_dest(dxbc::Dest::R(store_eM14_temp, export_eM14)); + dxbc::Src store_eM14_src(dxbc::Src::R(store_eM14_temp)); + a_.OpIAdd(eM14_address_dest, eM0_address_src, dxbc::Src::LU(1, 2, 3, 4)); + a_.OpILT(store_eM14_dest, eM14_address_src, index_count_src); + // Check if eM1...4 were actually written by the invocation and merge the + // result with store_eM14_temp. + uint32_t eM14_written_temp = PushSystemTemp(); + a_.OpIBFE(dxbc::Dest::R(eM14_written_temp, export_eM14), dxbc::Src::LU(1), + dxbc::Src::LU(1, 2, 3, 4), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY)); + a_.OpAnd(store_eM14_dest, store_eM14_src, dxbc::Src::R(eM14_written_temp)); + // Release eM14_written_temp. + PopSystemTemp(); + // Convert eM1...4 indices to global byte addresses. + a_.OpIShL(eM14_address_dest, eM14_address_src, element_size_src); + a_.OpIAdd(eM14_address_dest, base_address_src, eM14_address_src); + } + if (export_eM & 0b1) { + // Convert eM0 index to a global byte address if it's needed. + a_.OpIShL(eM0_address_dest, eM0_address_src, element_size_src); + a_.OpIAdd(eM0_address_dest, base_address_src, eM0_address_src); + // base_address_src and index_count_src are deallocated at this point (even + // if eM0 isn't potentially written), temp.zw are now free. + // Extract if eM0 was actually written by the invocation to temp.z. + a_.OpIBFE(dxbc::Dest::R(temp, 0b0100), dxbc::Src::LU(1), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY)); + } + dxbc::Src eM0_written_src(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + // Write depending on the element size. 
+ // No switch case will be entered for an unknown format (UINT32_MAX size + // written), so writing won't be attempted for it. + if (uav_index_shared_memory_ == kBindingIndexUnallocated) { + uav_index_shared_memory_ = uav_count_++; + } + uint8_t eM14_remaining; + uint32_t eM14_index; + a_.OpSwitch(element_size_src); + + // 8bpp, 16bpp. + dxbc::Dest atomic_dest(dxbc::Dest::U( + uav_index_shared_memory_, uint32_t(UAVRegister::kSharedMemory), 0)); + for (uint32_t i = 0; i <= 1; ++i) { + a_.OpCase(dxbc::Src::LU(i)); + dxbc::Src width_src(dxbc::Src::LU(8 << i)); + uint32_t sub_dword_temp = PushSystemTemp(); + if (export_eM & 0b1) { + a_.OpIf(true, eM0_written_src); + // sub_dword_temp.x = eM0 offset in the dword (8 << (byte_address & 3)) + // (assuming a little-endian host). + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0001), dxbc::Src::LU(2), + dxbc::Src::LU(3), eM0_address_src, dxbc::Src::LU(0)); + // Keep only the dword part of the address. + a_.OpAnd(eM0_address_dest, eM0_address_src, dxbc::Src::LU(~uint32_t(3))); + // Erase the bits that will be replaced with eM0 via sub_dword_temp.y. + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0010), width_src, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kXXXX), dxbc::Src::LU(0), + dxbc::Src::LU(UINT32_MAX)); + a_.OpAtomicAnd(atomic_dest, eM0_address_src, 0b0001, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kYYYY)); + // Add the eM0 bits via sub_dword_temp.y. + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0010), width_src, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kXXXX), + dxbc::Src::R(system_temps_memexport_data_[0], dxbc::Src::kXXXX), + dxbc::Src::LU(0)); + a_.OpAtomicOr(atomic_dest, eM0_address_src, 0b0001, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kYYYY)); + a_.OpEndIf(); + } + if (export_eM14) { + // sub_dword_temp = eM# offset in the dword (8 << (byte_address & 3)) + // (assuming a little-endian host). 
+      a_.OpBFI(dxbc::Dest::R(sub_dword_temp, export_eM14), dxbc::Src::LU(2),
+               dxbc::Src::LU(3), dxbc::Src::R(eM14_address_temp),
+               dxbc::Src::LU(0));
+      // Keep only the dword part of the address.
+      a_.OpAnd(dxbc::Dest::R(eM14_address_temp, export_eM14),
+               dxbc::Src::R(eM14_address_temp), dxbc::Src::LU(~uint32_t(3)));
+      uint32_t sub_dword_data_temp = PushSystemTemp();
+      eM14_remaining = export_eM14;
+      while (xe::bit_scan_forward(eM14_remaining, &eM14_index)) {
+        eM14_remaining &= ~(uint8_t(1) << eM14_index);
+        a_.OpIf(true, dxbc::Src::R(store_eM14_temp).Select(eM14_index));
+        // Erase the bits that will be replaced with eM# via
+        // sub_dword_data_temp.x.
+        a_.OpBFI(dxbc::Dest::R(sub_dword_data_temp, 0b0001), width_src,
+                 dxbc::Src::R(sub_dword_temp).Select(eM14_index),
+                 dxbc::Src::LU(0), dxbc::Src::LU(UINT32_MAX));
+        a_.OpAtomicAnd(
+            atomic_dest, dxbc::Src::R(eM14_address_temp).Select(eM14_index),
+            0b0001, dxbc::Src::R(sub_dword_data_temp, dxbc::Src::kXXXX));
+        // Add the eM# bits via sub_dword_data_temp.x.
+        a_.OpBFI(dxbc::Dest::R(sub_dword_data_temp, 0b0001), width_src,
+                 dxbc::Src::R(sub_dword_temp).Select(eM14_index),
+                 dxbc::Src::R(system_temps_memexport_data_[1 + eM14_index],
+                              dxbc::Src::kXXXX),
+                 dxbc::Src::LU(0));
+        a_.OpAtomicOr(
+            atomic_dest, dxbc::Src::R(eM14_address_temp).Select(eM14_index),
+            0b0001, dxbc::Src::R(sub_dword_data_temp, dxbc::Src::kXXXX));
+        a_.OpEndIf();
+      }
+      // Release sub_dword_data_temp.
+      PopSystemTemp();
+    }
+    // Release sub_dword_temp.
+    PopSystemTemp();
+    a_.OpBreak();
+  }
+
+  // 32bpp, 64bpp, 128bpp.
+  for (uint32_t i = 2; i <= 4; ++i) {
+    a_.OpCase(dxbc::Src::LU(i));
+    // Store (0b0001), Store2 (0b0011), Store4 (0b1111).
+ uint32_t store_mask = (uint32_t(1) << (uint32_t(1) << (i - 2))) - 1; + dxbc::Dest store_dest(dxbc::Dest::U(uav_index_shared_memory_, + uint32_t(UAVRegister::kSharedMemory), + store_mask)); + if (export_eM & 0b1) { + a_.OpIf(true, eM0_written_src); + a_.OpStoreRaw(store_dest, eM0_address_src, + dxbc::Src::R(system_temps_memexport_data_[0])); + a_.OpEndIf(); + } + eM14_remaining = export_eM14; + while (xe::bit_scan_forward(eM14_remaining, &eM14_index)) { + eM14_remaining &= ~(uint8_t(1) << eM14_index); + a_.OpIf(true, dxbc::Src::R(store_eM14_temp).Select(eM14_index)); + a_.OpStoreRaw(store_dest, + dxbc::Src::R(eM14_address_temp).Select(eM14_index), + dxbc::Src::R(system_temps_memexport_data_[1 + eM14_index])); + a_.OpEndIf(); + } + a_.OpBreak(); + } + + // Close the element size switch. + a_.OpEndSwitch(); + + if (export_eM14) { + // Release eM14_address_temp and store_eM14_temp. + PopSystemTemp(2); + } + + // Close the eM0 bounds check. a_.OpEndIf(); - // Release control_temp. + // Release temp. PopSystemTemp(); + + // Close the address correctness conditional. + a_.OpEndIf(); + + // Close the memory export allowed conditional. + a_.OpEndIf(); } } // namespace gpu diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index e9c21d801..c3d57438d 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -672,7 +672,7 @@ class Shader { // For implementation without unconditional support for memory writes from // vertex shaders, vertex shader converted to a compute shader doing only // memory export. - kMemexportCompute, + kMemExportCompute, // 4 host vertices for 1 guest vertex, for implementations without // unconditional geometry shader support. @@ -769,9 +769,16 @@ class Shader { } }; - // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game - // .pdb. 
- static constexpr uint32_t kMaxMemExports = 16; + struct ControlFlowMemExportInfo { + // Which eM elements have potentially (regardless of conditionals, loop + // iteration counts, predication) been written earlier in the predecessor + // graph of the instruction since an `alloc export`. + uint8_t eM_potentially_written_before = 0; + // For exec sequences, which eM elements are potentially (regardless of + // predication) written by the instructions in the sequence. For other + // control flow instructions, it's 0. + uint8_t eM_potentially_written_by_exec = 0; + }; class Translation { public: @@ -879,19 +886,21 @@ class Shader { return constant_register_map_; } - // uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have - // been written to after each `alloc export`, for up to Shader::kMaxMemExports - // exports. This will contain zero for certain corrupt exports - for those to - // which a valid eA was not written via a MAD with a stream constant. - const uint8_t* memexport_eM_written() const { return memexport_eM_written_; } + // Information about memory export state at each control flow instruction. May + // be empty if there are no eM# writes. + const std::vector& cf_memexport_info() const { + return cf_memexport_info_; + } - // All c# registers used as the addend in MAD operations to eA. + uint8_t memexport_eM_written() const { return memexport_eM_written_; } + uint8_t memexport_eM_potentially_written_before_end() const { + return memexport_eM_potentially_written_before_end_; + } + + // c# registers used as the addend in MAD operations to eA. const std::set& memexport_stream_constants() const { return memexport_stream_constants_; } - bool is_valid_memexport_used() const { - return !memexport_stream_constants_.empty(); - } // Labels that jumps (explicit or from loops) can be done to. 
const std::set& label_addresses() const { return label_addresses_; } @@ -969,7 +978,7 @@ class Shader { // TODO(Triang3l): Investigate what happens to memexport when the pixel // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early // depth/stencil. - return !kills_pixels() && !writes_depth() && !is_valid_memexport_used(); + return !kills_pixels() && !writes_depth() && !memexport_eM_written(); } // Whether each color render target is written to on any execution path. @@ -1041,8 +1050,6 @@ class Shader { std::vector vertex_bindings_; std::vector texture_bindings_; ConstantRegisterMap constant_register_map_ = {0}; - uint8_t memexport_eM_written_[kMaxMemExports] = {}; - std::set memexport_stream_constants_; std::set label_addresses_; uint32_t cf_pair_index_bound_ = 0; uint32_t register_static_address_bound_ = 0; @@ -1054,6 +1061,17 @@ class Shader { bool uses_texture_fetch_instruction_results_ = false; bool writes_depth_ = false; + // Memory export eM write info for each control flow instruction, if there are + // any eM writes in the shader. + std::vector cf_memexport_info_; + // Which memexport elements (eM#) are written for any memexport in the shader. + uint8_t memexport_eM_written_ = 0; + // ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the + // end of the shader, for the last memory export (or exports if the end has + // multiple predecessor chains exporting to memory). + uint8_t memexport_eM_potentially_written_before_end_ = 0; + std::set memexport_stream_constants_; + // Modification bits -> translation. 
std::unordered_map translations_; @@ -1063,8 +1081,7 @@ class Shader { void GatherExecInformation( const ParsedExecInstruction& instr, ucode::VertexFetchInstruction& previous_vfetch_full, - uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer); + uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer); void GatherVertexFetchInformation( const ucode::VertexFetchInstruction& op, ucode::VertexFetchInstruction& previous_vfetch_full, @@ -1073,13 +1090,12 @@ class Shader { uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer); void GatherAluInstructionInformation(const ucode::AluInstruction& op, - uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, + uint32_t exec_cf_index, StringBuffer& ucode_disasm_buffer); void GatherOperandInformation(const InstructionOperand& operand); void GatherFetchResultInformation(const InstructionResult& result); void GatherAluResultInformation(const InstructionResult& result, - uint32_t memexport_alloc_current_count); + uint32_t exec_cf_index); }; } // namespace gpu diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 88c7f95f4..d381edc94 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -87,8 +87,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { VertexFetchInstruction previous_vfetch_full; std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full)); uint32_t unique_texture_bindings = 0; - uint32_t memexport_alloc_count = 0; - uint32_t memexport_eA_written = 0; for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { ControlFlowInstruction cf_ab[2]; UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab); @@ -111,8 +109,7 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParsedExecInstruction instr; ParseControlFlowExec(cf.exec, cf_index, instr); GatherExecInformation(instr, 
previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kCondExec: case ControlFlowOpcode::kCondExecEnd: @@ -122,16 +119,14 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParsedExecInstruction instr; ParseControlFlowCondExec(cf.cond_exec, cf_index, instr); GatherExecInformation(instr, previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kCondExecPred: case ControlFlowOpcode::kCondExecPredEnd: { ParsedExecInstruction instr; ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr); GatherExecInformation(instr, previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kLoopStart: { ParsedLoopStartInstruction instr; @@ -173,9 +168,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParseControlFlowAlloc(cf.alloc, cf_index, type() == xenos::ShaderType::kVertex, instr); instr.Disassemble(&ucode_disasm_buffer); - if (instr.type == AllocType::kMemory) { - ++memexport_alloc_count; - } } break; case ControlFlowOpcode::kMarkVsFetchDone: break; @@ -187,7 +179,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { constant_register_map_.bool_bitmap[bool_constant_index / 32] |= uint32_t(1) << (bool_constant_index % 32); } - // TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))? } } ucode_disassembly_ = ucode_disasm_buffer.to_string(); @@ -206,17 +197,125 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { } } - // Cleanup invalid/unneeded memexport allocs. 
- for (uint32_t i = 0; i < kMaxMemExports; ++i) { - if (!(memexport_eA_written & (uint32_t(1) << i))) { - memexport_eM_written_[i] = 0; - } else if (!memexport_eM_written_[i]) { - memexport_eA_written &= ~(uint32_t(1) << i); + if (!cf_memexport_info_.empty()) { + // Gather potentially "dirty" memexport elements before each control flow + // instruction. `alloc` (any, not only `export`) flushes the previous memory + // export. On the guest GPU, yielding / serializing also terminates memory + // exports, but for simplicity disregarding that, as that functionally does + // nothing compared to flushing the previous memory export only at `alloc` + // or even only specifically at `alloc export`, Microsoft's validator checks + // if eM# aren't written after a `serialize`. + std::vector successor_stack; + for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { + ControlFlowInstruction eM_writing_cf_ab[2]; + UnpackControlFlowInstructions(ucode_data_.data() + i * 3, + eM_writing_cf_ab); + for (uint32_t j = 0; j < 2; ++j) { + uint32_t eM_writing_cf_index = i * 2 + j; + uint32_t eM_written_by_cf_instr = + cf_memexport_info_[eM_writing_cf_index] + .eM_potentially_written_by_exec; + if (eM_writing_cf_ab[j].opcode() == ControlFlowOpcode::kCondCall) { + // Until subroutine calls are handled accurately, assume that all eM# + // have potentially been written by the subroutine for simplicity. + eM_written_by_cf_instr = memexport_eM_written_; + } + if (!eM_written_by_cf_instr) { + continue; + } + + // If the control flow instruction potentially results in any eM# being + // written, mark those eM# as potentially written before each successor. 
+ bool is_successor_graph_head = true; + successor_stack.push_back(eM_writing_cf_index); + while (!successor_stack.empty()) { + uint32_t successor_cf_index = successor_stack.back(); + successor_stack.pop_back(); + + ControlFlowMemExportInfo& successor_memexport_info = + cf_memexport_info_[successor_cf_index]; + if ((successor_memexport_info.eM_potentially_written_before & + eM_written_by_cf_instr) == eM_written_by_cf_instr) { + // Already marked as written before this instruction (and thus + // before all its successors too). Possibly this instruction is in a + // loop, in this case an instruction may succeed itself. + break; + } + // The first instruction in the traversal is the writing instruction + // itself, not its successor. However, if it has been visited by the + // traversal twice, it's in a loop, so it succeeds itself, and thus + // writes from it are potentially done before it too. + if (!is_successor_graph_head) { + successor_memexport_info.eM_potentially_written_before |= + eM_written_by_cf_instr; + } + is_successor_graph_head = false; + + ControlFlowInstruction successor_cf_ab[2]; + UnpackControlFlowInstructions( + ucode_data_.data() + (successor_cf_index >> 1) * 3, + successor_cf_ab); + const ControlFlowInstruction& successor_cf = + successor_cf_ab[successor_cf_index & 1]; + + bool next_instr_is_new_successor = true; + switch (successor_cf.opcode()) { + case ControlFlowOpcode::kExecEnd: + // One successor: end. + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + next_instr_is_new_successor = false; + break; + case ControlFlowOpcode::kCondExecEnd: + case ControlFlowOpcode::kCondExecPredEnd: + case ControlFlowOpcode::kCondExecPredCleanEnd: + // Two successors: next, end. + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + break; + case ControlFlowOpcode::kLoopStart: + // Two successors: next, skip. 
+ successor_stack.push_back(successor_cf.loop_start.address()); + break; + case ControlFlowOpcode::kLoopEnd: + // Two successors: next, repeat. + successor_stack.push_back(successor_cf.loop_end.address()); + break; + case ControlFlowOpcode::kCondCall: + // Two successors: next, target. + successor_stack.push_back(successor_cf.cond_call.address()); + break; + case ControlFlowOpcode::kReturn: + // Currently treating all subroutine calls as potentially writing + // all eM# for simplicity, so just exit the subroutine. + next_instr_is_new_successor = false; + break; + case ControlFlowOpcode::kCondJmp: + // One or two successors: next if conditional, target. + successor_stack.push_back(successor_cf.cond_jmp.address()); + if (successor_cf.cond_jmp.is_unconditional()) { + next_instr_is_new_successor = false; + } + break; + case ControlFlowOpcode::kAlloc: + // Any `alloc` ends the previous export. + next_instr_is_new_successor = false; + break; + default: + break; + } + if (next_instr_is_new_successor) { + if (successor_cf_index < (cf_pair_index_bound_ << 1)) { + successor_stack.push_back(successor_cf_index + 1); + } else { + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + } + } + } + } } } - if (memexport_eA_written == 0) { - memexport_stream_constants_.clear(); - } is_ucode_analyzed_ = true; @@ -250,8 +349,7 @@ uint32_t Shader::GetInterpolatorInputMask(reg::SQ_PROGRAM_CNTL sq_program_cntl, void Shader::GatherExecInformation( const ParsedExecInstruction& instr, ucode::VertexFetchInstruction& previous_vfetch_full, - uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer) { instr.Disassemble(&ucode_disasm_buffer); uint32_t sequence = instr.sequence; for (uint32_t instr_offset = instr.instruction_address; @@ -273,8 +371,7 @@ void Shader::GatherExecInformation( } } else { auto& op = 
*reinterpret_cast(op_ptr); - GatherAluInstructionInformation(op, memexport_alloc_current_count, - memexport_eA_written, + GatherAluInstructionInformation(op, instr.dword_index, ucode_disasm_buffer); } } @@ -381,8 +478,8 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op, } void Shader::GatherAluInstructionInformation( - const AluInstruction& op, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + const AluInstruction& op, uint32_t exec_cf_index, + StringBuffer& ucode_disasm_buffer) { ParsedAluInstruction instr; ParseAluInstruction(op, type(), instr); instr.Disassemble(&ucode_disasm_buffer); @@ -394,10 +491,8 @@ void Shader::GatherAluInstructionInformation( (ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state & ucode::kAluOpChangedStatePixelKill); - GatherAluResultInformation(instr.vector_and_constant_result, - memexport_alloc_current_count); - GatherAluResultInformation(instr.scalar_result, - memexport_alloc_current_count); + GatherAluResultInformation(instr.vector_and_constant_result, exec_cf_index); + GatherAluResultInformation(instr.scalar_result, exec_cf_index); for (size_t i = 0; i < instr.vector_operand_count; ++i) { GatherOperandInformation(instr.vector_operands[i]); } @@ -405,9 +500,7 @@ void Shader::GatherAluInstructionInformation( GatherOperandInformation(instr.scalar_operands[i]); } - // Store used memexport constants because CPU code needs addresses and sizes, - // and also whether there have been writes to eA and eM# for register - // allocation in shader translator implementations. + // Store used memexport constants because CPU code needs addresses and sizes. 
// eA is (hopefully) always written to using: // mad eA, r#, const0100, c# // (though there are some exceptions, shaders in 4D5307E6 for some reason set @@ -416,13 +509,9 @@ void Shader::GatherAluInstructionInformation( // Export is done to vector_dest of the ucode instruction for both vector and // scalar operations - no need to check separately. if (instr.vector_and_constant_result.storage_target == - InstructionStorageTarget::kExportAddress && - memexport_alloc_current_count > 0 && - memexport_alloc_current_count <= Shader::kMaxMemExports) { + InstructionStorageTarget::kExportAddress) { uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant(); if (memexport_stream_constant != UINT32_MAX) { - memexport_eA_written |= uint32_t(1) - << (memexport_alloc_current_count - 1); memexport_stream_constants_.insert(memexport_stream_constant); } else { XELOGE( @@ -481,8 +570,8 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) { } } -void Shader::GatherAluResultInformation( - const InstructionResult& result, uint32_t memexport_alloc_current_count) { +void Shader::GatherAluResultInformation(const InstructionResult& result, + uint32_t exec_cf_index) { uint32_t used_write_mask = result.GetUsedWriteMask(); if (!used_write_mask) { return; @@ -504,11 +593,12 @@ void Shader::GatherAluResultInformation( writes_point_size_edge_flag_kill_vertex_ |= used_write_mask; break; case InstructionStorageTarget::kExportData: - if (memexport_alloc_current_count > 0 && - memexport_alloc_current_count <= Shader::kMaxMemExports) { - memexport_eM_written_[memexport_alloc_current_count - 1] |= - uint32_t(1) << result.storage_index; + memexport_eM_written_ |= uint8_t(1) << result.storage_index; + if (cf_memexport_info_.empty()) { + cf_memexport_info_.resize(2 * cf_pair_index_bound_); } + cf_memexport_info_[exec_cf_index].eM_potentially_written_by_exec |= + uint32_t(1) << result.storage_index; break; case InstructionStorageTarget::kColor: writes_color_targets_ 
|= uint32_t(1) << result.storage_index; @@ -665,7 +755,13 @@ void ShaderTranslator::TranslateControlFlowInstruction( case ControlFlowOpcode::kAlloc: { ParsedAllocInstruction instr; ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr); - ProcessAllocInstruction(instr); + const std::vector& cf_memexport_info = + current_shader().cf_memexport_info(); + ProcessAllocInstruction(instr, + instr.dword_index < cf_memexport_info.size() + ? cf_memexport_info[instr.dword_index] + .eM_potentially_written_before + : 0); } break; case ControlFlowOpcode::kMarkVsFetchDone: break; @@ -807,6 +903,14 @@ void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf, void ShaderTranslator::TranslateExecInstructions( const ParsedExecInstruction& instr) { ProcessExecInstructionBegin(instr); + + const std::vector& cf_memexport_info = + current_shader().cf_memexport_info(); + uint8_t eM_potentially_written_before = + instr.dword_index < cf_memexport_info.size() + ? cf_memexport_info[instr.dword_index].eM_potentially_written_before + : 0; + const uint32_t* ucode_dwords = current_shader().ucode_data().data(); uint32_t sequence = instr.sequence; for (uint32_t instr_offset = instr.instruction_address; @@ -832,9 +936,22 @@ void ShaderTranslator::TranslateExecInstructions( auto& op = *reinterpret_cast(op_ptr); ParsedAluInstruction alu_instr; ParseAluInstruction(op, current_shader().type(), alu_instr); - ProcessAluInstruction(alu_instr); + ProcessAluInstruction(alu_instr, eM_potentially_written_before); + if (alu_instr.vector_and_constant_result.storage_target == + InstructionStorageTarget::kExportData && + alu_instr.vector_and_constant_result.GetUsedWriteMask()) { + eM_potentially_written_before |= + uint8_t(1) << alu_instr.vector_and_constant_result.storage_index; + } + if (alu_instr.scalar_result.storage_target == + InstructionStorageTarget::kExportData && + alu_instr.scalar_result.GetUsedWriteMask()) { + eM_potentially_written_before |= + uint8_t(1) << 
alu_instr.scalar_result.storage_index; + } } } + ProcessExecInstructionEnd(instr); } diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 0e764fe30..bcce051bd 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -118,8 +118,10 @@ class ShaderTranslator { virtual void ProcessReturnInstruction(const ParsedReturnInstruction& instr) {} // Handles translation for jump instructions. virtual void ProcessJumpInstruction(const ParsedJumpInstruction& instr) {} - // Handles translation for alloc instructions. - virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr) {} + // Handles translation for alloc instructions. Memory exports for eM# + // indicated by export_eM must be performed, regardless of the alloc type. + virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) {} // Handles translation for vertex fetch instructions. virtual void ProcessVertexFetchInstruction( @@ -128,7 +130,13 @@ class ShaderTranslator { virtual void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) {} // Handles translation for ALU instructions. - virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {} + // memexport_eM_potentially_written_before needs to be handled by `kill` + // instruction to make sure memory exports for the eM# writes earlier in + // previous execs and the current exec are done before the invocation becomes + // inactive. 
+ virtual void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) {} private: void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf); diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index b7da0678d..9889fb630 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -134,7 +134,7 @@ class SpirvShaderTranslator : public ShaderTranslator { // (32-bit only - 16-bit indices are always fetched via the Vulkan index // buffer). kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift, - // For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip, + // For HostVertexShaderTypes kMemExportCompute, kPointListAsTriangleStrip, // kRectangleListAsTriangleStrip, whether the vertex index needs to be // loaded from the index buffer (rather than using autogenerated indices), // and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad @@ -427,7 +427,9 @@ class SpirvShaderTranslator : public ShaderTranslator { const ParsedVertexFetchInstruction& instr) override; void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) override; - void ProcessAluInstruction(const ParsedAluInstruction& instr) override; + void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) override; private: struct TextureBinding { @@ -620,7 +622,7 @@ class SpirvShaderTranslator : public ShaderTranslator { assert_true(edram_fragment_shader_interlock_); return !is_depth_only_fragment_shader_ && !current_shader().writes_depth() && - !current_shader().is_valid_memexport_used(); + !current_shader().memexport_eM_written(); } void FSI_LoadSampleMask(spv::Id msaa_samples); void FSI_LoadEdramOffsets(spv::Id msaa_samples); diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc index 
47978dd00..05e41d5ab 100644 --- a/src/xenia/gpu/spirv_shader_translator_alu.cc +++ b/src/xenia/gpu/spirv_shader_translator_alu.cc @@ -67,7 +67,8 @@ void SpirvShaderTranslator::KillPixel(spv::Id condition) { } void SpirvShaderTranslator::ProcessAluInstruction( - const ParsedAluInstruction& instr) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) { if (instr.IsNop()) { // Don't even disassemble or update predication. return; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index edecafe7f..5ae943f62 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -210,7 +210,7 @@ enum class AllocType : uint32_t { kVsInterpolators = 2, // Pixel shader exports colors. kPsColors = 2, - // MEMEXPORT? + // Memory export. kMemory = 3, }; @@ -1782,6 +1782,9 @@ inline uint32_t GetAluVectorOpNeededSourceComponents( .operand_components_used[src_index - 1]; } +// eM# (kExportData) register count. +constexpr uint32_t kMaxMemExportElementCount = 5; + enum class ExportRegister : uint32_t { kVSInterpolator0 = 0, kVSInterpolator1, diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index ece50a2f2..b1697dd06 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2187,7 +2187,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; // Pixel shader analysis. 
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 8f16690b3..4c1b30534 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -497,6 +497,18 @@ enum class TextureFormat : uint32_t { k_6_5_5 = 5, k_8_8_8_8 = 6, k_2_10_10_10 = 7, + // Possibly similar to k_8, but may be storing alpha instead of red when + // resolving/memexporting, though not exactly known. From the point of view of + // sampling, it should be treated the same as k_8 (given that textures have + // the last - and single-component textures have the only - component + // replicated into all the remaining ones before the swizzle). + // Used as: + // - Texture in 4B4E083C - text, starting from the "Loading..." and the "This + // game saves data automatically" messages. The swizzle in the fetch + // constant is 111W (suggesting that internally the only component may be + // the alpha one, not red). + // TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and + // memexports, whether they store alpha/blue of the input or red. k_8_A = 8, k_8_B = 9, k_8_8 = 10, @@ -510,6 +522,12 @@ enum class TextureFormat : uint32_t { // Used for videos in 54540829. k_Y1_Cr_Y0_Cb_REP = 12, k_16_16_EDRAM = 13, + // Likely same as k_8_8_8_8. + // Used as: + // - Memexport destination in 4D5308BC - multiple small draws when looking + // back at the door behind the player in the first room of gameplay. + // - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame + // between the intro video and the main menu, in a 8192-point draw. 
k_8_8_8_8_A = 14, k_4_4_4_4 = 15, k_10_11_11 = 16, @@ -1373,8 +1391,7 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also // interesting to see how alphatest interacts with it, whether it's still true // fixed-function alphatest, as it's claimed to be supported as usual by the -// extension specification - it's likely, however, that memory exports are -// discarded alongside other exports such as oC# and oDepth this way. +// extension specification. // // Y of eA contains the offset in elements - this is what shaders are supposed // to calculate from something like the vertex index. Again, it's specified as @@ -1397,6 +1414,69 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // elements using packing via addition to 2^23, so this field also doesn't need // more bits than that. // +// According to the sequencer specification from IPR2015-00325 (where memexport +// is called "pass thru export"): +// - Pass thru exports can occur anywhere in the shader program. +// - There can be any number of pass thru exports. +// - The address register is not kept across clause boundaries, so it must be +// refreshed after any Serialize (or yield), allocate instruction or resource +// change. +// - The write to eM# may be predicated if the export is not needed. +// - Exports are dropped if: +// - The index is above the maximum. +// - The index sign bit is 1. +// - The exponent of the index is not 23. +// The requirement that eM4 must be written if any eM# other than eM0 is also +// written doesn't apply to the final Xenos, it's likely an outdated note in the +// specification considering that it's very preliminary. +// +// According to Microsoft's shader validator: +// - eA can be written only by `mad`. +// - A single eM# can be written by any number of instruction, including with +// write masking. +// - eA must be written before eM#. 
+// - Any alloc instruction or a `serialize` terminates the current memory +// export. This doesn't apply to `exec Yield=true`, however, and it's not +// clear if that's an oversight or if that's not considered a yield that +// terminates the export. +// +// From the emulation perspective, this means that: +// - Alloc instructions (`alloc export` mandatorily, other allocs optionally), +// and optionally `serialize` instructions within `exec`, should be treated as +// the locations where the currently open export should be flushed to the +// memory. It should be taken into account that an export may be in looping +// control flow, and in this case it must be performed at every iteration. +// - Whether each eM# was written to must be tracked at shader execution time, +// as predication can disable the export of an element. +// +// TODO(Triang3l): Investigate how memory export interacts with pixel killing. +// Given that eM# writes disabled by predication don't cause an export, it's +// possible that killed invocations are treated as inactive (invalid in Xenos +// terms) overall, and thus new memory exports from them shouldn't be done, but +// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan +// hosts, discarding disables subsequent storage resource writes, on the host, +// it would be natural to perform all outstanding memory exports before +// discarding if the kill condition passes. +// +// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp +// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for +// instance, due to the minimum resource view size limitation on Direct3D 11). +// In this case, bytes and shorts aren't addressable directly. 
However, taking +// into account that memory accesses are coherent within one shader invocation +// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relatively to +// each other, it should be possible to implement them by clearing the bits via +// an atomic AND, and writing the new value using an atomic OR. This will, of +// course, make the entire write operation non-atomic, and in case of a race +// between writes to the same location, the final result may not even be just a +// value from one of the invocations, but rather, it can be OR of the values +// from any invocations involved. However, on the Xenos, there doesn't seem to +// be any possibility of meaningfully accessing the same location from multiple +// invocations if any of them is writing, memory exports are out-of-order, so +// such an implementation shouldn't be causing issues in reality. Atomic +// compare-exchange, however, should not be used for this purpose, as it may +// result in an infinite loop if different invocations want to write different +// values to the same memory location. 
+// // Examples of setup in titles (Z from MSB to LSB): // // 4D5307E6 particles (different VS invocation counts, like 1, 2, 4): @@ -1432,6 +1512,11 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // c0: Z = 010010110000|0|010|11|011010|00011|001 // 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch // (16_16_16_16 is the largest color format without special values) +// +// 58410B86 hierarchical depth buffer occlusion culling with the result read on +// the CPU (15000 VS invocations in the main menu): +// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count +// No endian swap, 8, uint, RGBA union alignas(uint32_t) xe_gpu_memexport_stream_t { struct { uint32_t dword_0; diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index 1b0b9c844..a08da87bd 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -119,6 +119,8 @@ dword_result_t XamContentCreateEnumerator_entry( } DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented); +enum class kDispositionState : uint32_t { Unknown = 0, Create = 1, Open = 2 }; + dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, lpvoid_t content_data_ptr, dword_t content_data_size, dword_t flags, @@ -146,40 +148,37 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, content_data, disposition_ptr, license_mask_ptr, overlapped_ptr]( uint32_t& extended_error, uint32_t& length) -> X_RESULT { X_RESULT result = X_ERROR_INVALID_PARAMETER; - bool create = false; - bool open = false; + kDispositionState disposition = kDispositionState::Unknown; switch (flags & 0xF) { case 1: // CREATE_NEW // Fail if exists. if (content_manager->ContentExists(content_data)) { result = X_ERROR_ALREADY_EXISTS; } else { - create = true; + disposition = kDispositionState::Create; } break; case 2: // CREATE_ALWAYS // Overwrite existing, if any. 
if (content_manager->ContentExists(content_data)) { content_manager->DeleteContent(content_data); - create = true; - } else { - create = true; } + disposition = kDispositionState::Create; break; case 3: // OPEN_EXISTING // Open only if exists. if (!content_manager->ContentExists(content_data)) { result = X_ERROR_PATH_NOT_FOUND; } else { - open = true; + disposition = kDispositionState::Open; } break; case 4: // OPEN_ALWAYS // Create if needed. if (!content_manager->ContentExists(content_data)) { - create = true; + disposition = kDispositionState::Create; } else { - open = true; + disposition = kDispositionState::Open; } break; case 5: // TRUNCATE_EXISTING @@ -188,7 +187,7 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, result = X_ERROR_PATH_NOT_FOUND; } else { content_manager->DeleteContent(content_data); - create = true; + disposition = kDispositionState::Create; } break; default: @@ -196,21 +195,12 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, break; } - // creation result - // 0 = ? - // 1 = created - // 2 = opened - uint32_t disposition = create ? 
1 : 2; - if (disposition_ptr) { - *disposition_ptr = disposition; - } - - if (create) { + if (disposition == kDispositionState::Create) { result = content_manager->CreateContent(root_name, content_data); if (XSUCCEEDED(result)) { content_manager->WriteContentHeaderFile(&content_data); } - } else if (open) { + } else if (disposition == kDispositionState::Open) { result = content_manager->OpenContent(root_name, content_data); } @@ -224,12 +214,11 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, } extended_error = X_HRESULT_FROM_WIN32(result); - length = disposition; + length = static_cast(disposition); if (result && overlapped_ptr) { result = X_ERROR_FUNCTION_FAILED; } - return result; }; @@ -451,7 +440,6 @@ static_assert_size(X_SWAPDISC_ERROR_MESSAGE, 12); dword_result_t XamSwapDisc_entry( dword_t disc_number, pointer_t completion_handle, pointer_t error_message) { - xex2_opt_execution_info* info = nullptr; kernel_state()->GetExecutableModule()->GetOptHeader(XEX_HEADER_EXECUTION_INFO, &info); diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index e3230aea7..d1a240312 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -254,202 +254,15 @@ dword_result_t XGetLanguage_entry() { } DECLARE_XAM_EXPORT1(XGetLanguage, kNone, kImplemented); -// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/ -// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/ -dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) { - LARGE_INTEGER delay{}; - - // Convert the delay time to 100-nanosecond intervals - delay.QuadPart = dwMilliseconds == -1 - ? 
LLONG_MAX - : static_cast(-10000) * dwMilliseconds; - - X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, - (uint64_t*)&delay); - - // If the delay was interrupted by an APC, keep delaying the thread - while (bAlertable && result == X_STATUS_ALERTED) { - result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, - (uint64_t*)&delay); - } - - return result == X_STATUS_SUCCESS ? X_STATUS_SUCCESS : X_STATUS_USER_APC; -} -DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented); - -dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) { - return RtlSleep_entry(dwMilliseconds, bAlertable); -} -DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented); - -// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep -void Sleep_entry(dword_t dwMilliseconds) { - RtlSleep_entry(dwMilliseconds, FALSE); -} -DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented); - -// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount -dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); } -DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented); - dword_result_t XamGetCurrentTitleId_entry() { return kernel_state()->emulator()->title_id(); } DECLARE_XAM_EXPORT1(XamGetCurrentTitleId, kNone, kImplemented); -dword_result_t RtlSetLastNTError_entry(dword_t error_code) { - const uint32_t result = - xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code); - XThread::SetLastError(result); - - return result; +dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) { + return ctx->kernel_state->title_id() == 0xFFFE07D1; } -DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented); - -dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); } -DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented); - -dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); } -DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented); - -dword_result_t 
GetModuleHandleA_entry(lpstring_t module_name) { - xe::be module_ptr = 0; - const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle( - module_name.value(), &module_ptr); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return NULL; - } - - return (uint32_t)module_ptr; -} -DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented); - -dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes, - dword_t dwStackSize, - lpvoid_t lpStartAddress, - lpvoid_t lpParameter, - dword_t dwCreationFlags, dword_t unkn, - lpdword_t lpThreadId) { - uint32_t flags = (dwCreationFlags >> 2) & 1; - - if (unkn != -1) { - flags |= 1 << unkn << 24; - } - - xe::be result = 0; - - const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread( - &result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return NULL; - } - - return (uint32_t)result; -} -DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented); - -dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes, - dword_t dwStackSize, lpvoid_t lpStartAddress, - lpvoid_t lpParameter, dword_t dwCreationFlags, - lpdword_t lpThreadId) { - return XapipCreateThread_entry(lpThreadAttributes, dwStackSize, - lpStartAddress, lpParameter, dwCreationFlags, - -1, lpThreadId); -} -DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented); - -dword_result_t CloseHandle_entry(dword_t hObject) { - const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return false; - } - - return true; -} -DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented); - -dword_result_t ResumeThread_entry(dword_t hThread) { - uint32_t suspend_count; - const X_STATUS error_code = - xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return -1; - } - - return 
suspend_count; -} -DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented); - -void ExitThread_entry(dword_t exit_code) { - xe::kernel::xboxkrnl::ExTerminateThread(exit_code); -} -DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented); - -dword_result_t GetCurrentThreadId_entry() { - return XThread::GetCurrentThread()->GetCurrentThreadId(); -} -DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented); - -qword_result_t XapiFormatTimeOut_entry(lpqword_t result, - dword_t dwMilliseconds) { - LARGE_INTEGER delay{}; - - // Convert the delay time to 100-nanosecond intervals - delay.QuadPart = - dwMilliseconds == -1 ? 0 : static_cast(-10000) * dwMilliseconds; - - return (uint64_t)&delay; -} -DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented); - -dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle, - dword_t dwMilliseconds, - dword_t bAlertable) { - uint64_t* timeout = nullptr; - uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds); - - X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( - hHandle, 1, bAlertable, &timeout_ptr); - - while (bAlertable && result == X_STATUS_ALERTED) { - result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( - hHandle, 1, bAlertable, &timeout_ptr); - } - - RtlSetLastNTError_entry(result); - result = -1; - - return result; -} -DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented); - -dword_result_t WaitForSingleObject_entry(dword_t hHandle, - dword_t dwMilliseconds) { - return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0); -} -DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented); - -dword_result_t lstrlenW_entry(lpu16string_t string) { - // wcslen? 
- if (string) { - return (uint32_t)string.value().length(); - } - - return NULL; -} -DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented); +DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented); dword_result_t XamGetExecutionId_entry(lpdword_t info_ptr) { auto module = kernel_state()->GetExecutableModule(); @@ -611,16 +424,204 @@ dword_result_t XamQueryLiveHiveW_entry(lpu16string_t name, lpvoid_t out_buf, } DECLARE_XAM_EXPORT1(XamQueryLiveHiveW, kNone, kStub); -dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) { - return ctx->kernel_state->title_id() == 0xFFFE07D1; +// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/ +// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/ +dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) { + LARGE_INTEGER delay{}; + + // Convert the delay time to 100-nanosecond intervals + delay.QuadPart = dwMilliseconds == -1 + ? LLONG_MAX + : static_cast(-10000) * dwMilliseconds; + + X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, + (uint64_t*)&delay); + + // If the delay was interrupted by an APC, keep delaying the thread + while (bAlertable && result == X_STATUS_ALERTED) { + result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, + (uint64_t*)&delay); + } + + return result == X_STATUS_SUCCESS ? 
X_STATUS_SUCCESS : X_STATUS_USER_APC; } -DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented); +DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented); + +dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) { + return RtlSleep_entry(dwMilliseconds, bAlertable); +} +DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented); + +// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep +void Sleep_entry(dword_t dwMilliseconds) { + RtlSleep_entry(dwMilliseconds, FALSE); +} +DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented); + +// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount +dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); } +DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented); + +dword_result_t RtlSetLastNTError_entry(dword_t error_code) { + const uint32_t result = + xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code); + XThread::SetLastError(result); + + return result; +} +DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented); + +dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); } +DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented); + +dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); } +DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented); + +dword_result_t GetModuleHandleA_entry(lpstring_t module_name) { + xe::be module_ptr = 0; + const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle( + module_name.value(), &module_ptr); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return NULL; + } + + return (uint32_t)module_ptr; +} +DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented); + +dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes, + dword_t dwStackSize, + lpvoid_t lpStartAddress, + lpvoid_t lpParameter, + dword_t dwCreationFlags, dword_t unkn, + lpdword_t lpThreadId) { + uint32_t flags = (dwCreationFlags >> 2) & 1; + + if 
(unkn != -1) { + flags |= 1 << unkn << 24; + } + + xe::be result = 0; + + const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread( + &result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return NULL; + } + + return (uint32_t)result; +} +DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented); + +dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes, + dword_t dwStackSize, lpvoid_t lpStartAddress, + lpvoid_t lpParameter, dword_t dwCreationFlags, + lpdword_t lpThreadId) { + return XapipCreateThread_entry(lpThreadAttributes, dwStackSize, + lpStartAddress, lpParameter, dwCreationFlags, + -1, lpThreadId); +} +DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented); + +dword_result_t CloseHandle_entry(dword_t hObject) { + const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return false; + } + + return true; +} +DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented); + +dword_result_t ResumeThread_entry(dword_t hThread) { + uint32_t suspend_count; + const X_STATUS error_code = + xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return -1; + } + + return suspend_count; +} +DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented); + +void ExitThread_entry(dword_t exit_code) { + xe::kernel::xboxkrnl::ExTerminateThread(exit_code); +} +DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented); + +dword_result_t GetCurrentThreadId_entry() { + return XThread::GetCurrentThread()->GetCurrentThreadId(); +} +DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented); + +qword_result_t XapiFormatTimeOut_entry(lpqword_t result, + dword_t dwMilliseconds) { + LARGE_INTEGER delay{}; + + // Convert the delay time to 100-nanosecond intervals + delay.QuadPart = + dwMilliseconds == -1 ? 
0 : static_cast(-10000) * dwMilliseconds; + + return (uint64_t)&delay; +} +DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented); + +dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle, + dword_t dwMilliseconds, + dword_t bAlertable) { + uint64_t* timeout = nullptr; + uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds); + + X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( + hHandle, 1, bAlertable, &timeout_ptr); + + while (bAlertable && result == X_STATUS_ALERTED) { + result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( + hHandle, 1, bAlertable, &timeout_ptr); + } + + RtlSetLastNTError_entry(result); + result = -1; + + return result; +} +DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented); + +dword_result_t WaitForSingleObject_entry(dword_t hHandle, + dword_t dwMilliseconds) { + return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0); +} +DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented); + +dword_result_t lstrlenW_entry(lpu16string_t string) { + // wcslen? + if (string) { + return (uint32_t)string.value().length(); + } + + return NULL; +} +DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented); dword_result_t XGetAudioFlags_entry() { return 65537; } DECLARE_XAM_EXPORT1(XGetAudioFlags, kNone, kStub); /* - todo: this table should instead be pointed to by a member of kernel state and initialized along with the process + todo: this table should instead be pointed to by a member of kernel + state and initialized along with the process */ static int32_t XamRtlRandomTable[128] = { 1284227242, 1275210071, 573735546, 790525478, 2139871995, 1547161642,