Merge branch 'master' of https://github.com/xenia-project/xenia into canary_experimental

2023-09-01 18:20:29 +02:00 · 2023-09-01 18:20:29 +02:00 · ce9a82ccf8
parent dc29307a55 c5e6352c34
commit ce9a82ccf8
24 changed files with 1771 additions and 1184 deletions
--- a/src/xenia/base/filesystem_posix.cc
+++ b/src/xenia/base/filesystem_posix.cc
@ -217,6 +217,10 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
  }

  while (auto ent = readdir(dir)) {
+    if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
+      continue;
+    }
+
    FileInfo info;

    info.name = ent->d_name;
@ -225,6 +229,7 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
    info.create_timestamp = convertUnixtimeToWinFiletime(st.st_ctime);
    info.access_timestamp = convertUnixtimeToWinFiletime(st.st_atime);
    info.write_timestamp = convertUnixtimeToWinFiletime(st.st_mtime);
+    info.path = path;
    if (ent->d_type == DT_DIR) {
      info.type = FileInfo::Type::kDirectory;
      info.total_size = 0;
@ -234,7 +239,7 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
    }
    result.push_back(info);
  }
-
+  closedir(dir);
  return result;
 }

--- a/src/xenia/base/utf8.cc
+++ b/src/xenia/base/utf8.cc
@ -10,6 +10,7 @@
 #include "xenia/base/utf8.h"

 #include <algorithm>
+#include <cstdint>
 #include <locale>
 #include <numeric>
 #include <tuple>
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@ -481,6 +481,43 @@ struct VECTOR_COMPARE_UGT_V128
    : Sequence<VECTOR_COMPARE_UGT_V128,
               I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
+                           kX64EmitAVX512DQ) &&
+        (i.instr->flags != FLOAT32_TYPE)) {
+      Xmm src1 = e.xmm0;
+      if (i.src1.is_constant) {
+        e.LoadConstantXmm(src1, i.src1.constant());
+      } else {
+        src1 = i.src1;
+      }
+
+      Xmm src2 = e.xmm1;
+      if (i.src2.is_constant) {
+        e.LoadConstantXmm(src2, i.src2.constant());
+      } else {
+        src2 = i.src2;
+      }
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpcmpub(e.k1, src1, src2, 0x6);
+          e.vpmovm2b(i.dest, e.k1);
+          break;
+        case INT16_TYPE:
+          e.vpcmpuw(e.k1, src1, src2, 0x6);
+          e.vpmovm2w(i.dest, e.k1);
+          break;
+        case INT32_TYPE:
+          e.vpcmpud(e.k1, src1, src2, 0x6);
+          e.vpmovm2d(i.dest, e.k1);
+          break;
+        default:
+          assert_always();
+          break;
+      }
+      return;
+    }
+
    Xbyak::Address sign_addr = e.ptr[e.rax];  // dummy
    switch (i.instr->flags) {
      case INT8_TYPE:
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@ -646,8 +646,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          break;
        case OPCODE_AND_NOT:
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-            v->set_from(i->src1.value);
-            v->AndNot(i->src2.value);
+            v->set_from(i->src2.value);
+            v->Not();
+            v->And(i->src1.value);
            i->UnlinkAndNOP();
            result = true;
          }
--- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
@ -324,8 +324,13 @@ int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
 }

 int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* sum = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE,
+                           ARITHMETIC_UNSIGNED);
+  Value* overflow = f.VectorCompareUGT(f.LoadVR(i.VX.VA), sum, INT32_TYPE);
+  Value* carry =
+      f.VectorShr(overflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE);
+  f.StoreVR(i.VX.VD, carry);
+  return 0;
 }

 int InstrEmit_vaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) {
@ -1665,7 +1670,11 @@ int InstrEmit_vsrw128(PPCHIRBuilder& f, const InstrData& i) {
 }

 int InstrEmit_vsubcuw(PPCHIRBuilder& f, const InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
+  Value* underflow =
+      f.VectorCompareUGE(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE);
+  Value* borrow =
+      f.VectorShr(underflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE);
+  f.StoreVR(i.VX.VD, borrow);
-  return 1;
+  return 0;  // Implemented now; 1 is what the not-implemented stubs return.
 }

--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@ -2574,7 +2574,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
    return false;
  }
  pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
-  const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
+
+  const bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0;

  // Pixel shader analysis.
  bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
@ -2604,7 +2605,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
  }

  const bool memexport_used_pixel =
-      pixel_shader && pixel_shader->is_valid_memexport_used();
+      pixel_shader && (pixel_shader->memexport_eM_written() != 0);
  const bool memexport_used = memexport_used_vertex || memexport_used_pixel;

  if (!BeginSubmission(true)) {
@ -2831,12 +2832,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
  // Gather memexport ranges and ensure the heaps for them are resident, and
  // also load the data surrounding the export and to fill the regions that
  // won't be modified by the shaders.
-
-  memexport_range_count_ = 0;
-  if (memexport_used_vertex || memexport_used_pixel) {
-    bool retflag;
-    bool retval = GatherMemexportRangesAndMakeResident(retflag);
-    if (retflag) return retval;
+  memexport_ranges_.clear();
+  if (memexport_used_vertex) {
+    draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_);
+  }
+  if (memexport_used_pixel) {
+    draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_);
+  }
+  for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
+    if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
+                                      memexport_range.size_bytes)) {
+      XELOGE(
+          "Failed to request memexport stream at 0x{:08X} (size {}) in the "
+          "shared memory",
+          memexport_range.base_address_dwords << 2, memexport_range.size_bytes);
+      return false;
+    }
  }
  // Primitive topology.
  D3D_PRIMITIVE_TOPOLOGY primitive_topology;
@ -2935,11 +2946,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
          // If the shared memory is a UAV, it can't be used as an index buffer
          // (UAV is a read/write state, index buffer is a read-only state).
          // Need to copy the indices to a buffer in the index buffer state.
-          bool retflag;
-          bool retval = HandleMemexportGuestDMA(
-              scratch_index_buffer, index_buffer_view,
-              primitive_processing_result.guest_index_base, retflag);
-          if (retflag) return retval;
+          scratch_index_buffer = RequestScratchGPUBuffer(
+              index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
+          if (scratch_index_buffer == nullptr) {
+            return false;
+          }
+          shared_memory_->UseAsCopySource();
+          SubmitBarriers();
+          deferred_command_list_.D3DCopyBufferRegion(
+              scratch_index_buffer, 0, shared_memory_->GetBuffer(),
+              primitive_processing_result.guest_index_base,
+              index_buffer_view.SizeInBytes);
+          PushTransitionBarrier(scratch_index_buffer,
+                                D3D12_RESOURCE_STATE_COPY_DEST,
+                                D3D12_RESOURCE_STATE_INDEX_BUFFER);
+          index_buffer_view.BufferLocation =
+              scratch_index_buffer->GetGPUVirtualAddress();
        } else {
          index_buffer_view.BufferLocation =
              shared_memory_->GetGPUAddress() +
@ -2977,159 +2999,23 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
  }

  if (memexport_used) {
-    HandleMemexportDrawOrdering_AndReadback();
-  }
-
-  return true;
-}
-XE_COLD
-XE_NOINLINE
-bool D3D12CommandProcessor::HandleMemexportGuestDMA(
-    ID3D12Resource*& scratch_index_buffer,
-    D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base,
-    // xe::gpu::PrimitiveProcessor::ProcessingResult&
-    // primitive_processing_result,
-    bool& retflag) {
-  retflag = true;
-  scratch_index_buffer = RequestScratchGPUBuffer(
-      index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
-  if (scratch_index_buffer == nullptr) {
-    return false;
-  }
-  shared_memory_->UseAsCopySource();
-  SubmitBarriers();
-  deferred_command_list_.D3DCopyBufferRegion(
-      scratch_index_buffer, 0, shared_memory_->GetBuffer(), guest_index_base,
-      index_buffer_view.SizeInBytes);
-  PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST,
-                        D3D12_RESOURCE_STATE_INDEX_BUFFER);
-  index_buffer_view.BufferLocation =
-      scratch_index_buffer->GetGPUVirtualAddress();
-  retflag = false;
-  return {};
-}
-XE_NOINLINE
-XE_COLD
-bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident(
-    bool& retflag) {
-  auto vertex_shader = static_cast<D3D12Shader*>(active_vertex_shader());
-  auto pixel_shader = static_cast<D3D12Shader*>(active_pixel_shader());
-  const xe::gpu::RegisterFile& regs = *register_file_;
-  const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
-  const bool memexport_used_pixel =
-      pixel_shader && pixel_shader->is_valid_memexport_used();
-  retflag = true;
-  if (memexport_used_vertex) {
-    for (uint32_t constant_index :
-         vertex_shader->memexport_stream_constants()) {
-      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
-          XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
-      if (memexport_stream.index_count == 0) {
-        continue;
-      }
-      uint32_t memexport_format_size =
-          GetSupportedMemExportFormatSize(memexport_stream.format);
-      if (memexport_format_size == 0) {
-        XELOGE("Unsupported memexport format {}",
-               FormatInfo::GetName(
-                   xenos::TextureFormat(uint32_t(memexport_stream.format))));
-        return false;
-      }
-      uint32_t memexport_size_dwords =
-          memexport_stream.index_count * memexport_format_size;
-      // Try to reduce the number of shared memory operations when writing
-      // different elements into the same buffer through different exports
-      // (happens in 4D5307E6).
-      bool memexport_range_reused = false;
-      for (uint32_t i = 0; i < memexport_range_count_; ++i) {
-        MemExportRange& memexport_range = memexport_ranges_[i];
-        if (memexport_range.base_address_dwords ==
-            memexport_stream.base_address) {
-          memexport_range.size_dwords =
-              std::max(memexport_range.size_dwords, memexport_size_dwords);
-          memexport_range_reused = true;
-          break;
-        }
-      }
-      // Add a new range if haven't expanded an existing one.
-      if (!memexport_range_reused) {
-        MemExportRange& memexport_range =
-            memexport_ranges_[memexport_range_count_++];
-        memexport_range.base_address_dwords = memexport_stream.base_address;
-        memexport_range.size_dwords = memexport_size_dwords;
-      }
-    }
-  }
-  if (memexport_used_pixel) {
-    for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
-      const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
-          XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
-      if (memexport_stream.index_count == 0) {
-        continue;
-      }
-      uint32_t memexport_format_size =
-          GetSupportedMemExportFormatSize(memexport_stream.format);
-      if (memexport_format_size == 0) {
-        XELOGE("Unsupported memexport format {}",
-               FormatInfo::GetName(
-                   xenos::TextureFormat(uint32_t(memexport_stream.format))));
-        return false;
-      }
-      uint32_t memexport_size_dwords =
-          memexport_stream.index_count * memexport_format_size;
-      bool memexport_range_reused = false;
-      for (uint32_t i = 0; i < memexport_range_count_; ++i) {
-        MemExportRange& memexport_range = memexport_ranges_[i];
-        if (memexport_range.base_address_dwords ==
-            memexport_stream.base_address) {
-          memexport_range.size_dwords =
-              std::max(memexport_range.size_dwords, memexport_size_dwords);
-          memexport_range_reused = true;
-          break;
-        }
-      }
-      if (!memexport_range_reused) {
-        MemExportRange& memexport_range =
-            memexport_ranges_[memexport_range_count_++];
-        memexport_range.base_address_dwords = memexport_stream.base_address;
-        memexport_range.size_dwords = memexport_size_dwords;
-      }
-    }
-  }
-  for (uint32_t i = 0; i < memexport_range_count_; ++i) {
-    const MemExportRange& memexport_range = memexport_ranges_[i];
-    if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
-                                      memexport_range.size_dwords << 2)) {
-      XELOGE(
-          "Failed to request memexport stream at 0x{:08X} (size {}) in the "
-          "shared memory",
-          memexport_range.base_address_dwords << 2,
-          memexport_range.size_dwords << 2);
-      return false;
-    }
-  }
-  retflag = false;
-  return {};
-}
-XE_NOINLINE
-XE_COLD
-void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
    // Make sure this memexporting draw is ordered with other work using shared
    // memory as a UAV.
    // TODO(Triang3l): Find some PM4 command that can be used for indication of
    // when memexports should be awaited?
    shared_memory_->MarkUAVWritesCommitNeeded();
    // Invalidate textures in memexported memory and watch for changes.
-  for (uint32_t i = 0; i < memexport_range_count_; ++i) {
-    const MemExportRange& memexport_range = memexport_ranges_[i];
-    shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2,
-                                      memexport_range.size_dwords << 2, false);
+    for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
+      shared_memory_->RangeWrittenByGpu(
+          memexport_range.base_address_dwords << 2, memexport_range.size_bytes,
+          false);
    }
    if (cvars::d3d12_readback_memexport) {
      // Read the exported data on the CPU.
      uint32_t memexport_total_size = 0;
-    for (uint32_t i = 0; i < memexport_range_count_; ++i) {
-      memexport_total_size += memexport_ranges_[i].size_dwords << 2;
+      for (const draw_util::MemExportRange& memexport_range :
+           memexport_ranges_) {
+        memexport_total_size += memexport_range.size_bytes;
      }
      if (memexport_total_size != 0) {
        ID3D12Resource* readback_buffer =
@ -3139,9 +3025,9 @@ void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
          SubmitBarriers();
          ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
          uint32_t readback_buffer_offset = 0;
-        for (uint32_t i = 0; i < memexport_range_count_; ++i) {
-          const MemExportRange& memexport_range = memexport_ranges_[i];
-          uint32_t memexport_range_size = memexport_range.size_dwords << 2;
+          for (const draw_util::MemExportRange& memexport_range :
+               memexport_ranges_) {
+            uint32_t memexport_range_size = memexport_range.size_bytes;
            deferred_command_list_.D3DCopyBufferRegion(
                readback_buffer, readback_buffer_offset, shared_memory_buffer,
                memexport_range.base_address_dwords << 2, memexport_range_size);
@ -3154,14 +3040,14 @@ void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
            void* readback_mapping;
            if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
                                               &readback_mapping))) {
-            const uint32_t* readback_dwords =
-                reinterpret_cast<const uint32_t*>(readback_mapping);
-            for (uint32_t i = 0; i < memexport_range_count_; ++i) {
-              const MemExportRange& memexport_range = memexport_ranges_[i];
+              const uint8_t* readback_bytes =
+                  reinterpret_cast<const uint8_t*>(readback_mapping);
+              for (const draw_util::MemExportRange& memexport_range :
+                   memexport_ranges_) {
                std::memcpy(memory_->TranslatePhysical(
                                memexport_range.base_address_dwords << 2),
-                          readback_dwords, memexport_range.size_dwords << 2);
-              readback_dwords += memexport_range.size_dwords;
+                            readback_bytes, memexport_range.size_bytes);
+                readback_bytes += memexport_range.size_bytes;
              }
              D3D12_RANGE readback_write_range = {};
              readback_buffer->Unmap(0, &readback_write_range);
@ -3170,6 +3056,9 @@ void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
        }
      }
    }
+  }
+
+  return true;
 }

 void D3D12CommandProcessor::InitializeTrace() {
@ -5208,36 +5097,6 @@ bool D3D12CommandProcessor::UpdateBindings_BindfulPath(
  return {};
 }

-uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize(
-    xenos::ColorFormat format) {
-  switch (format) {
-    case xenos::ColorFormat::k_8_8_8_8:
-    case xenos::ColorFormat::k_2_10_10_10:
-    // TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the
-    // texture cache currently.
-    // case xenos::ColorFormat::k_8_8_8_8_A:
-    case xenos::ColorFormat::k_10_11_11:
-    case xenos::ColorFormat::k_11_11_10:
-    case xenos::ColorFormat::k_16_16:
-    case xenos::ColorFormat::k_16_16_FLOAT:
-    case xenos::ColorFormat::k_32_FLOAT:
-    case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16:
-    case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16:
-    case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16:
-    case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16:
-      return 1;
-    case xenos::ColorFormat::k_16_16_16_16:
-    case xenos::ColorFormat::k_16_16_16_16_FLOAT:
-    case xenos::ColorFormat::k_32_32_FLOAT:
-      return 2;
-    case xenos::ColorFormat::k_32_32_32_32_FLOAT:
-      return 4;
-    default:
-      break;
-  }
-  return 0;
-}
-
 ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
  if (size == 0) {
    return nullptr;
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <utility>
+#include <vector>

 #include "xenia/base/assert.h"
 #include "xenia/gpu/command_processor.h"
@ -319,18 +320,7 @@ class D3D12CommandProcessor final : public CommandProcessor {
  bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count,
                 IndexBufferInfo* index_buffer_info,
                 bool major_mode_explicit) override;
-  XE_COLD
-  XE_NOINLINE
-  bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer,
-                               D3D12_INDEX_BUFFER_VIEW& index_buffer_view,
-                               uint32_t guest_index_base,
-                               bool& retflag);
-  XE_NOINLINE
-  XE_COLD
-  bool GatherMemexportRangesAndMakeResident(bool& retflag);
-  XE_NOINLINE
-  XE_COLD
-  void HandleMemexportDrawOrdering_AndReadback();
+
  bool IssueCopy() override;
  XE_NOINLINE
  bool IssueCopy_ReadbackResolvePath();
@ -502,13 +492,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
      const size_t sampler_count_vertex, const size_t sampler_count_pixel,
      bool& retflag);

-  // Returns dword count for one element for a memexport format, or 0 if it's
-  // not supported by the D3D12 command processor (if it's smaller that 1 dword,
-  // for instance).
-  // TODO(Triang3l): Check if any game uses memexport with formats smaller than
-  // 32 bits per element.
-  static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format);
-
  // Returns a buffer for reading GPU data back to the CPU. Assuming
  // synchronizing immediately after use. Always in COPY_DEST state.
  ID3D12Resource* RequestReadbackBuffer(uint32_t size);
@ -811,12 +794,13 @@ class D3D12CommandProcessor final : public CommandProcessor {

  draw_util::GetViewportInfoArgs previous_viewport_info_args_;
  draw_util::ViewportInfo previous_viewport_info_;
-  // scratch memexport data
-  MemExportRange memexport_ranges_[512];
-  uint32_t memexport_range_count_ = 0;
+

  std::atomic<bool> pix_capture_requested_ = false;
  bool pix_capturing_;
+
+  // Temporary storage for memexport stream constants used in the draw.
+  std::vector<draw_util::MemExportRange> memexport_ranges_;
 };

 }  // namespace d3d12
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2022 Ben Vanik. All rights reserved.                             *
+ * Copyright 2023 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -134,7 +134,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
  //
  // Memory export is an obvious intentional side effect.
  if (shader.kills_pixels() || shader.writes_depth() ||
-      shader.is_valid_memexport_used() ||
+      shader.memexport_eM_written() ||
      (shader.writes_color_target(0) &&
       DoesCoverageDependOnAlpha(regs.Get<reg::RB_COLORCONTROL>()))) {
    return true;
@ -765,8 +765,70 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
  }
  return normalized_color_mask;
 }
+
+void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
+                        std::vector<MemExportRange>& ranges_out) {
+  if (!shader.memexport_eM_written()) {
+    // The shader has eA writes, but no real exports.
+    return;
+  }
+  uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex
+                                      ? regs.Get<reg::SQ_VS_CONST>().base
+                                      : regs.Get<reg::SQ_PS_CONST>().base;
+  for (uint32_t constant_index : shader.memexport_stream_constants()) {
+    const auto& stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
+        XE_GPU_REG_SHADER_CONSTANT_000_X +
+        (float_constants_base + constant_index) * 4);
+    if (!stream.index_count) {
+      continue;
+    }
+    const FormatInfo& format_info =
+        *FormatInfo::Get(xenos::TextureFormat(stream.format));
+    if (format_info.type != FormatType::kResolvable) {
+      XELOGE("Unsupported memexport format {}",
+             FormatInfo::GetName(format_info.format));
+      // Translated shaders shouldn't be performing exports with an unknown
+      // format, the draw can still be performed.
+      continue;
+    }
+    // TODO(Triang3l): Remove the unresearched format logging when it's known
+    // how exactly these formats need to be handled (most importantly what
+    // components need to be stored and in which order).
+    switch (stream.format) {
+      case xenos::ColorFormat::k_8_A:
+      case xenos::ColorFormat::k_8_B:
+      case xenos::ColorFormat::k_8_8_8_8_A:
+        XELOGW(
+            "Memexport done to an unresearched format {}, report the game to "
+            "Xenia developers!",
+            FormatInfo::GetName(format_info.format));
+        break;
+      default:
+        break;
+    }
+    uint32_t stream_size_bytes =
+        stream.index_count * (format_info.bits_per_pixel >> 3);
+    // Try to reduce the number of shared memory operations when writing
+    // different elements into the same buffer through different exports
+    // (happens in 4D5307E6).
+    bool range_reused = false;
+    for (MemExportRange& range : ranges_out) {
+      if (range.base_address_dwords == stream.base_address) {
+        range.size_bytes = std::max(range.size_bytes, stream_size_bytes);
+        range_reused = true;
+        break;
+      }
+    }
+    // Add a new range if haven't expanded an existing one.
+    if (!range_reused) {
+      ranges_out.emplace_back(stream.base_address, stream_size_bytes);
+    }
+  }
+}
+
 XE_NOINLINE
 XE_NOALIAS
+
 xenos::CopySampleSelect SanitizeCopySampleSelect(
    xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
    bool is_depth) {
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@ -13,6 +13,7 @@
 #include <cmath>
 #include <cstdint>
 #include <utility>
+#include <vector>

 #include "xenia/base/assert.h"
 #include "xenia/gpu/register_file.h"
@ -474,6 +475,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
  return guest_sample_index ? 3 : 0;
 }

+struct MemExportRange {
+  uint32_t base_address_dwords;
+  uint32_t size_bytes;
+
+  explicit MemExportRange(uint32_t base_address_dwords, uint32_t size_bytes)
+      : base_address_dwords(base_address_dwords), size_bytes(size_bytes) {}
+};
+
+// Gathers memory ranges involved in memexports in the shader with the float
+// constants from the registers, adding them to ranges_out.
+void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
+                        std::vector<MemExportRange>& ranges_out);
+
 // To avoid passing values that the shader won't understand (even though
 // Direct3D 9 shouldn't pass them anyway).
 XE_NOINLINE
--- a/src/xenia/gpu/dxbc.h
+++ b/src/xenia/gpu/dxbc.h
@ -913,6 +913,8 @@ enum class OperandModifier : uint32_t {

 struct Dest : OperandAddress {
  // Ignored for 0-component and 1-component operand types.
+  // For 4-component operand types, if the write mask is 0, it's treated as
+  // 0-component.
  uint32_t write_mask_;

  // Input destinations (v*) are for use only in declarations. Vector input
@ -1028,12 +1030,16 @@ struct Dest : OperandAddress {
  void Write(std::vector<uint32_t>& code, bool in_dcl = false) const {
    uint32_t operand_token = GetOperandTokenTypeAndIndex();
    OperandDimension dimension = GetDimension(in_dcl);
-    operand_token |= uint32_t(dimension);
    if (dimension == OperandDimension::kVector) {
-      assert_true(write_mask_ > 0b0000 && write_mask_ <= 0b1111);
+      if (write_mask_) {
+        assert_true(write_mask_ <= 0b1111);
        operand_token |=
            (uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4);
+      } else {
+        dimension = OperandDimension::kNoData;
      }
+    }
+    operand_token |= uint32_t(dimension);
    code.push_back(operand_token);
    OperandAddress::Write(code);
  }
@ -1508,6 +1514,8 @@ enum class Opcode : uint32_t {
  kStoreUAVTyped = 164,
  kLdRaw = 165,
  kStoreRaw = 166,
+  kAtomicAnd = 169,
+  kAtomicOr = 170,
  kEvalSampleIndex = 204,
  kEvalCentroid = 205,
 };
@ -2396,6 +2404,14 @@ class Assembler {
    ++stat_.instruction_count;
    ++stat_.c_texture_store_instructions;
  }
+  void OpAtomicAnd(const Dest& dest, const Src& address,
+                   uint32_t address_components, const Src& value) {
+    EmitAtomicOp(Opcode::kAtomicAnd, dest, address, address_components, value);
+  }
+  void OpAtomicOr(const Dest& dest, const Src& address,
+                  uint32_t address_components, const Src& value) {
+    EmitAtomicOp(Opcode::kAtomicOr, dest, address, address_components, value);
+  }
  void OpEvalSampleIndex(const Dest& dest, const Src& value,
                         const Src& sample_index) {
    uint32_t dest_write_mask = dest.GetMask();
@ -2522,6 +2538,22 @@ class Assembler {
    src1.Write(code_, true, 0b0000);
    ++stat_.instruction_count;
  }
+  void EmitAtomicOp(Opcode opcode, const Dest& dest, const Src& address,
+                    uint32_t address_components, const Src& value) {
+    // Atomic operations require a 0-component memory destination.
+    assert_zero(dest.GetMask());
+    uint32_t address_mask = (1 << address_components) - 1;
+    uint32_t operands_length = dest.GetLength() +
+                               address.GetLength(address_mask) +
+                               value.GetLength(0b0001);
+    code_.reserve(code_.size() + 1 + operands_length);
+    code_.push_back(OpcodeToken(opcode, operands_length));
+    dest.Write(code_);
+    address.Write(code_, true, address_mask);
+    value.Write(code_, true, 0b0001);
+    ++stat_.instruction_count;
+    ++stat_.c_interlocked_instructions;
+  }

  std::vector<uint32_t>& code_;
  Statistics& stat_;
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
@ -179,8 +179,6 @@ void DxbcShaderTranslator::Reset() {

  sampler_bindings_.clear();

-  memexport_alloc_current_count_ = 0;
-
  std::memset(&shader_feature_info_, 0, sizeof(shader_feature_info_));
  std::memset(&statistics_, 0, sizeof(statistics_));
 }
@ -789,6 +787,63 @@ void DxbcShaderTranslator::StartPixelShader() {
      PopSystemTemp();
    }
  }
+
+  if (current_shader().memexport_eM_written()) {
+    // Make sure memexport is done only once for a guest pixel.
+    dxbc::Dest memexport_enabled_dest(
+        dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001));
+    dxbc::Src memexport_enabled_src(dxbc::Src::R(
+        system_temp_memexport_enabled_and_eM_written_, dxbc::Src::kXXXX));
+    uint32_t resolution_scaled_axes =
+        uint32_t(draw_resolution_scale_x_ > 1) |
+        (uint32_t(draw_resolution_scale_y_ > 1) << 1);
+    if (resolution_scaled_axes) {
+      uint32_t memexport_condition_temp = PushSystemTemp();
+      // Only do memexport for one host pixel in a guest pixel - prefer the
+      // host pixel closer to the center of the guest pixel, but one that's
+      // covered with the half-pixel offset according to the top-left rule (1
+      // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x
+      // because it's the center and is covered with the half-pixel offset too).
+      in_position_used_ |= resolution_scaled_axes;
+      a_.OpFToU(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
+                dxbc::Src::V1D(in_reg_ps_position_));
+      a_.OpUDiv(dxbc::Dest::Null(),
+                dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
+                dxbc::Src::R(memexport_condition_temp),
+                dxbc::Src::LU(draw_resolution_scale_x_,
+                              draw_resolution_scale_y_, 0, 0));
+      a_.OpIEq(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
+               dxbc::Src::R(memexport_condition_temp),
+               dxbc::Src::LU(draw_resolution_scale_x_ >> 1,
+                             draw_resolution_scale_y_ >> 1, 0, 0));
+      for (uint32_t i = 0; i < 2; ++i) {
+        if (!(resolution_scaled_axes & (1 << i))) {
+          continue;
+        }
+        a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
+                 dxbc::Src::R(memexport_condition_temp).Select(i));
+      }
+      // Release memexport_condition_temp.
+      PopSystemTemp();
+    }
+    // With sample-rate shading (with float24 conversion), only do memexport
+    // from one sample (as the shader is invoked multiple times for a pixel),
+    // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage,
+    // firstbit_lo returns 0xFFFFFFFF.
+    if (IsSampleRate()) {
+      uint32_t memexport_condition_temp = PushSystemTemp();
+      a_.OpFirstBitLo(dxbc::Dest::R(memexport_condition_temp, 0b0001),
+                      dxbc::Src::VCoverage());
+      a_.OpIEq(
+          dxbc::Dest::R(memexport_condition_temp, 0b0001),
+          dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY),
+          dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
+      a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
+               dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
+      // Release memexport_condition_temp.
+      PopSystemTemp();
+    }
+  }
 }

 void DxbcShaderTranslator::StartTranslation() {
@ -885,34 +940,27 @@ void DxbcShaderTranslator::StartTranslation() {
    }
  }

-  if (!is_depth_only_pixel_shader_) {
-    // Allocate temporary registers for memexport addresses and data.
-    std::memset(system_temps_memexport_address_, 0xFF,
-                sizeof(system_temps_memexport_address_));
-    std::memset(system_temps_memexport_data_, 0xFF,
-                sizeof(system_temps_memexport_data_));
-    system_temp_memexport_written_ = UINT32_MAX;
-    const uint8_t* memexports_written = current_shader().memexport_eM_written();
-    for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
-      uint32_t memexport_alloc_written = memexports_written[i];
-      if (memexport_alloc_written == 0) {
-        continue;
-      }
-      // If memexport is used at all, allocate a register containing whether eM#
-      // have actually been written to.
-      if (system_temp_memexport_written_ == UINT32_MAX) {
-        system_temp_memexport_written_ = PushSystemTemp(0b1111);
-      }
-      system_temps_memexport_address_[i] = PushSystemTemp(0b1111);
-      uint32_t memexport_data_index;
-      while (xe::bit_scan_forward(memexport_alloc_written,
-                                  &memexport_data_index)) {
-        memexport_alloc_written &= ~(1u << memexport_data_index);
-        system_temps_memexport_data_[i][memexport_data_index] =
-            PushSystemTemp();
+  // Allocate temporary registers for memexport.
+  uint8_t memexport_eM_written = current_shader().memexport_eM_written();
+  if (memexport_eM_written) {
+    system_temp_memexport_enabled_and_eM_written_ = PushSystemTemp(0b0010);
+    // Initialize the memexport conditional to whether the shared memory is
+    // currently bound as UAV (to 0 or UINT32_MAX). It can be made narrower
+    // later.
+    a_.OpIBFE(
+        dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001),
+        dxbc::Src::LU(1), dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift),
+        LoadFlagsSystemConstant());
+    system_temp_memexport_address_ = PushSystemTemp(0b1111);
+    uint8_t memexport_eM_remaining = memexport_eM_written;
+    uint32_t memexport_eM_index;
+    while (xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) {
+      memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index);
+      system_temps_memexport_data_[memexport_eM_index] = PushSystemTemp(0b1111);
    }
  }

+  if (!is_depth_only_pixel_shader_) {
    // Allocate system temporary variables for the translated code. Since access
    // depends on the guest code (thus no guarantees), initialize everything
    // now (except for pv, it's an internal temporary variable, not accessible
@ -1091,27 +1139,19 @@ void DxbcShaderTranslator::CompleteShaderCode() {
    // - system_temp_grad_h_lod_.
    // - system_temp_grad_v_vfetch_address_.
    PopSystemTemp(6);
+  }

-    // Write memexported data to the shared memory UAV.
-    ExportToMemory();
+  uint8_t memexport_eM_written = current_shader().memexport_eM_written();
+  if (memexport_eM_written) {
+    // Write data for the last memexport.
+    ExportToMemory(
+        current_shader().memexport_eM_potentially_written_before_end());

-    // Release memexport temporary registers.
-    for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) {
-      if (system_temps_memexport_address_[i] == UINT32_MAX) {
-        continue;
-      }
-      // Release exported data registers.
-      for (int j = 4; j >= 0; --j) {
-        if (system_temps_memexport_data_[i][j] != UINT32_MAX) {
-          PopSystemTemp();
-        }
-      }
-      // Release the address register.
-      PopSystemTemp();
-    }
-    if (system_temp_memexport_written_ != UINT32_MAX) {
-      PopSystemTemp();
-    }
+    // Release memexport temporary registers:
+    // - system_temp_memexport_enabled_and_eM_written_.
+    // - system_temp_memexport_address_.
+    // - system_temps_memexport_data_.
+    PopSystemTemp(xe::bit_count(uint32_t(memexport_eM_written)) + 2);
  }

  // Write stage-specific epilogue.
@ -1514,36 +1554,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
      dest = dxbc::Dest::R(system_temp_point_size_edge_flag_kill_vertex_);
      break;
    case InstructionStorageTarget::kExportAddress:
-      // Validate memexport writes (4D5307E6 has some completely invalid ones).
-      if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
-          memexport_alloc_current_count_ > Shader::kMaxMemExports ||
-          system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
-              UINT32_MAX) {
+      if (!current_shader().memexport_eM_written()) {
        return;
      }
-      dest = dxbc::Dest::R(
-          system_temps_memexport_address_[memexport_alloc_current_count_ - 1]);
+      dest = dxbc::Dest::R(system_temp_memexport_address_);
      break;
    case InstructionStorageTarget::kExportData: {
-      // Validate memexport writes (4D5307E6 has some completely invalid ones).
-      if (memexport_alloc_current_count_ == 0 ||
-          memexport_alloc_current_count_ > Shader::kMaxMemExports ||
-          system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
-                                      [result.storage_index] == UINT32_MAX) {
-        return;
-      }
-      dest = dxbc::Dest::R(
-          system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
-                                      [result.storage_index]);
+      assert_not_zero(current_shader().memexport_eM_written() &
+                      (uint8_t(1) << result.storage_index));
+      dest = dxbc::Dest::R(system_temps_memexport_data_[result.storage_index]);
      // Mark that the eM# has been written to and needs to be exported.
      assert_not_zero(used_write_mask);
-      uint32_t memexport_index = memexport_alloc_current_count_ - 1;
-      a_.OpOr(dxbc::Dest::R(system_temp_memexport_written_,
-                            1 << (memexport_index >> 2)),
-              dxbc::Src::R(system_temp_memexport_written_)
-                  .Select(memexport_index >> 2),
-              dxbc::Src::LU(uint32_t(1) << (result.storage_index +
-                                            ((memexport_index & 3) << 3))));
+      a_.OpOr(
+          dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
+          dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_,
+                       dxbc::Src::kYYYY),
+          dxbc::Src::LU(uint8_t(1) << result.storage_index));
    } break;
    case InstructionStorageTarget::kColor:
      assert_not_zero(used_write_mask);
@ -1990,15 +2016,38 @@ void DxbcShaderTranslator::ProcessJumpInstruction(
 }

 void DxbcShaderTranslator::ProcessAllocInstruction(
-    const ParsedAllocInstruction& instr) {
+    const ParsedAllocInstruction& instr, uint8_t export_eM) {
+  bool start_memexport = instr.type == AllocType::kMemory &&
+                         current_shader().memexport_eM_written();
+  if (export_eM || start_memexport) {
+    CloseExecConditionals();
+  }
+
  if (emit_source_map_) {
    instruction_disassembly_buffer_.Reset();
    instr.Disassemble(&instruction_disassembly_buffer_);
    EmitInstructionDisassembly();
  }

-  if (instr.type == AllocType::kMemory) {
-    ++memexport_alloc_current_count_;
+  if (export_eM) {
+    ExportToMemory(export_eM);
+    // Reset which eM# elements have been written.
+    a_.OpMov(
+        dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
+        dxbc::Src::LU(0));
+    // Break dependencies from the previous memexport.
+    uint8_t export_eM_remaining = export_eM;
+    uint32_t eM_index;
+    while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) {
+      export_eM_remaining &= ~(uint8_t(1) << eM_index);
+      a_.OpMov(dxbc::Dest::R(system_temps_memexport_data_[eM_index]),
+               dxbc::Src::LF(0.0f));
+    }
+  }
+
+  if (start_memexport) {
+    // Initialize eA to an invalid address.
+    a_.OpMov(dxbc::Dest::R(system_temp_memexport_address_), dxbc::Src::LU(0));
  }
 }

@ -2851,7 +2900,7 @@ void DxbcShaderTranslator::WriteInputSignature() {
    // Sample index (SV_SampleIndex) for safe memexport with sample-rate
    // shading.
    size_t sample_index_position = SIZE_MAX;
-    if (current_shader().is_valid_memexport_used() && IsSampleRate()) {
+    if (current_shader().memexport_eM_written() && IsSampleRate()) {
      size_t sample_index_position = shader_object_.size();
      shader_object_.resize(shader_object_.size() + kParameterDwords);
      ++parameter_count;
@ -3625,7 +3674,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
          dxbc::Name::kPosition);
    }
    bool sample_rate_memexport =
-        current_shader().is_valid_memexport_used() && IsSampleRate();
+        current_shader().memexport_eM_written() && IsSampleRate();
    // Sample-rate shading can't be done with UAV-only rendering (sample-rate
    // shading is only needed for float24 depth conversion when using a float32
    // host depth buffer).
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@ -20,6 +20,7 @@
 #include "xenia/base/string_buffer.h"
 #include "xenia/gpu/dxbc.h"
 #include "xenia/gpu/shader_translator.h"
+#include "xenia/gpu/ucode.h"
 #include "xenia/ui/graphics_provider.h"

 namespace xe {
@ -589,13 +590,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
  void ProcessLoopEndInstruction(
      const ParsedLoopEndInstruction& instr) override;
  void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override;
-  void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override;
+  void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
+                               uint8_t export_eM) override;

  void ProcessVertexFetchInstruction(
      const ParsedVertexFetchInstruction& instr) override;
  void ProcessTextureFetchInstruction(
      const ParsedTextureFetchInstruction& instr) override;
-  void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
+  void ProcessAluInstruction(
+      const ParsedAluInstruction& instr,
+      uint8_t memexport_eM_potentially_written_before) override;

 private:
  // IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature
@ -674,6 +678,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
  // Frees the last allocated internal r# registers for later reuse.
  void PopSystemTemp(uint32_t count = 1);

+  // ExportToMemory modifies the values of eA/eM# for simplicity, so call it
+  // only before starting a new export, ending the invocation, or making it
+  // inactive.
+  void ExportToMemory(uint8_t export_eM);
+
  // Converts one scalar from piecewise linear gamma to linear. The target may
  // be the same as the source, the temporary variables must be different. If
  // the source is not pre-saturated, saturation will be done internally.
@ -728,7 +737,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
  bool ROV_IsDepthStencilEarly() const {
    assert_true(edram_rov_used_);
    return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
-           !current_shader().is_valid_memexport_used();
+           !current_shader().memexport_eM_written();
  }
  // Converts the pre-clamped depth value to 24-bit (storing the result in bits
  // 0:23 and zeros in 24:31, not creating room for stencil - since this may be
@ -787,14 +796,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
  void StartPixelShader_LoadROVParameters();
  void StartPixelShader();

-  // Writing the epilogue.
-  // ExportToMemory modifies the values of eA/eM# for simplicity, don't call
-  // multiple times.
-  void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count,
-                                  const uint32_t bits[4],
-                                  const dxbc::Src& is_integer,
-                                  const dxbc::Src& is_signed);
-  void ExportToMemory();
  void CompleteVertexOrDomainShader();
  // For RTV, adds the sample to coverage_temp.coverage_temp_component if it
  // passes alpha to mask (or, if initialize == true (for the first sample
@ -917,13 +918,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
        .SelectFromSwizzled(word_index & 1);
  }

-  void KillPixel(bool condition, const dxbc::Src& condition_src);
+  void KillPixel(bool condition, const dxbc::Src& condition_src,
+                 uint8_t memexport_eM_potentially_written_before);

-  void ProcessVectorAluOperation(const ParsedAluInstruction& instr,
-                                 uint32_t& result_swizzle,
-                                 bool& predicate_written);
-  void ProcessScalarAluOperation(const ParsedAluInstruction& instr,
+  void ProcessVectorAluOperation(
+      const ParsedAluInstruction& instr,
+      uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
      bool& predicate_written);
+  void ProcessScalarAluOperation(
+      const ParsedAluInstruction& instr,
+      uint8_t memexport_eM_potentially_written_before, bool& predicate_written);

  void WriteResourceDefinition();
  void WriteInputSignature();
@ -1124,14 +1128,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
  // writing).
  uint32_t system_temps_color_[4];

-  // Bits containing whether each eM# has been written, for up to 16 streams, or
-  // UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with
-  // 4 `alloc export`s per component.
-  uint32_t system_temp_memexport_written_;
-  // eA in each `alloc export`, or UINT32_MAX if not used.
-  uint32_t system_temps_memexport_address_[Shader::kMaxMemExports];
-  // eM# in each `alloc export`, or UINT32_MAX if not used.
-  uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5];
+  // Memory export temporary registers are allocated if the shader writes any
+  // eM# (current_shader().memexport_eM_written() != 0).
+  // X - whether memexport is enabled for this invocation.
+  // Y - which eM# elements have been written so far by the invocation since the
+  //     last memory write.
+  uint32_t system_temp_memexport_enabled_and_eM_written_;
+  // eA.
+  uint32_t system_temp_memexport_address_;
+  // eM#.
+  uint32_t system_temps_memexport_data_[ucode::kMaxMemExportElementCount];

  // Vector ALU or fetch result / scratch (since Xenos write masks can contain
  // swizzles).
@ -1195,10 +1201,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
  uint32_t uav_index_edram_;

  std::vector<SamplerBinding> sampler_bindings_;
-
-  // Number of `alloc export`s encountered so far in the translation. The index
-  // of the current eA/eM# temp register set is this minus 1, if it's not 0.
-  uint32_t memexport_alloc_current_count_;
 };

 }  // namespace gpu
--- a/src/xenia/gpu/dxbc_shader_translator_alu.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc
@ -19,22 +19,29 @@ namespace xe {
 namespace gpu {
 using namespace ucode;

-void DxbcShaderTranslator::KillPixel(bool condition,
-                                     const dxbc::Src& condition_src) {
+void DxbcShaderTranslator::KillPixel(
+    bool condition, const dxbc::Src& condition_src,
+    uint8_t memexport_eM_potentially_written_before) {
+  a_.OpIf(condition, condition_src);
+  // Perform outstanding memory exports before the invocation becomes inactive
+  // and UAV writes are disabled.
+  ExportToMemory(memexport_eM_potentially_written_before);
  // Discard the pixel, but continue execution if other lanes in the quad need
  // this lane for derivatives. The driver may also perform early exiting
  // internally if all lanes are discarded if deemed beneficial.
-  a_.OpDiscard(condition, condition_src);
+  a_.OpDiscard(true, dxbc::Src::LU(UINT32_MAX));
  if (edram_rov_used_) {
    // Even though discarding disables all subsequent UAV/ROV writes, also skip
    // as much of the Render Backend emulation logic as possible by setting the
    // coverage and the mask of the written render targets to zero.
    a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0));
  }
+  a_.OpEndIf();
 }

 void DxbcShaderTranslator::ProcessVectorAluOperation(
-    const ParsedAluInstruction& instr, uint32_t& result_swizzle,
+    const ParsedAluInstruction& instr,
+    uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
    bool& predicate_written) {
  result_swizzle = dxbc::Src::kXYZW;
  predicate_written = false;
@ -506,7 +513,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
      a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
+                memexport_eM_potentially_written_before);
      if (used_result_components) {
        a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                 dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -522,7 +530,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
      a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
+                memexport_eM_potentially_written_before);
      if (used_result_components) {
        a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                 dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -538,7 +547,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
      a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
+                memexport_eM_potentially_written_before);
      if (used_result_components) {
        a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                 dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -554,7 +564,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
      a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
              dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
+                memexport_eM_potentially_written_before);
      if (used_result_components) {
        a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                 dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -640,7 +651,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
 }

 void DxbcShaderTranslator::ProcessScalarAluOperation(
-    const ParsedAluInstruction& instr, bool& predicate_written) {
+    const ParsedAluInstruction& instr,
+    uint8_t memexport_eM_potentially_written_before, bool& predicate_written) {
  predicate_written = false;

  if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev) {
@ -950,27 +962,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(

    case AluScalarOpcode::kKillsEq:
      a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
-      KillPixel(true, ps_src);
+      KillPixel(true, ps_src, memexport_eM_potentially_written_before);
      a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
      break;
    case AluScalarOpcode::kKillsGt:
      a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a);
-      KillPixel(true, ps_src);
+      KillPixel(true, ps_src, memexport_eM_potentially_written_before);
      a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
      break;
    case AluScalarOpcode::kKillsGe:
      a_.OpGE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
-      KillPixel(true, ps_src);
+      KillPixel(true, ps_src, memexport_eM_potentially_written_before);
      a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
      break;
    case AluScalarOpcode::kKillsNe:
      a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
-      KillPixel(true, ps_src);
+      KillPixel(true, ps_src, memexport_eM_potentially_written_before);
      a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
      break;
    case AluScalarOpcode::kKillsOne:
      a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f));
-      KillPixel(true, ps_src);
+      KillPixel(true, ps_src, memexport_eM_potentially_written_before);
      a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
      break;

@ -1024,7 +1036,8 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
 }

 void DxbcShaderTranslator::ProcessAluInstruction(
-    const ParsedAluInstruction& instr) {
+    const ParsedAluInstruction& instr,
+    uint8_t memexport_eM_potentially_written_before) {
  if (instr.IsNop()) {
    // Don't even disassemble or update predication.
    return;
@ -1041,10 +1054,11 @@ void DxbcShaderTranslator::ProcessAluInstruction(
  // checked again later.
  bool predicate_written_vector = false;
  uint32_t vector_result_swizzle = dxbc::Src::kXYZW;
-  ProcessVectorAluOperation(instr, vector_result_swizzle,
-                            predicate_written_vector);
+  ProcessVectorAluOperation(instr, memexport_eM_potentially_written_before,
+                            vector_result_swizzle, predicate_written_vector);
  bool predicate_written_scalar = false;
-  ProcessScalarAluOperation(instr, predicate_written_scalar);
+  ProcessScalarAluOperation(instr, memexport_eM_potentially_written_before,
+                            predicate_written_scalar);

  StoreResult(instr.vector_and_constant_result,
              dxbc::Src::R(system_temp_result_, vector_result_swizzle),
--- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@ -672,7 +672,7 @@ class Shader {
    // For implementation without unconditional support for memory writes from
    // vertex shaders, vertex shader converted to a compute shader doing only
    // memory export.
-    kMemexportCompute,
+    kMemExportCompute,

    // 4 host vertices for 1 guest vertex, for implementations without
    // unconditional geometry shader support.
@ -769,9 +769,16 @@ class Shader {
    }
  };

-  // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game
-  // .pdb.
-  static constexpr uint32_t kMaxMemExports = 16;
+  struct ControlFlowMemExportInfo {
+    // Which eM elements have potentially (regardless of conditionals, loop
+    // iteration counts, predication) been written earlier in the predecessor
+    // graph of the instruction since an `alloc export`.
+    uint8_t eM_potentially_written_before = 0;
+    // For exec sequences, which eM elements are potentially (regardless of
+    // predication) written by the instructions in the sequence. For other
+    // control flow instructions, it's 0.
+    uint8_t eM_potentially_written_by_exec = 0;
+  };

  class Translation {
   public:
@ -879,19 +886,21 @@ class Shader {
    return constant_register_map_;
  }

-  // uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have
-  // been written to after each `alloc export`, for up to Shader::kMaxMemExports
-  // exports. This will contain zero for certain corrupt exports - for those to
-  // which a valid eA was not written via a MAD with a stream constant.
-  const uint8_t* memexport_eM_written() const { return memexport_eM_written_; }
+  // Information about memory export state at each control flow instruction. May
+  // be empty if there are no eM# writes.
+  const std::vector<ControlFlowMemExportInfo>& cf_memexport_info() const {
+    return cf_memexport_info_;
+  }

-  // All c# registers used as the addend in MAD operations to eA.
+  uint8_t memexport_eM_written() const { return memexport_eM_written_; }
+  uint8_t memexport_eM_potentially_written_before_end() const {
+    return memexport_eM_potentially_written_before_end_;
+  }
+
+  // c# registers used as the addend in MAD operations to eA.
  const std::set<uint32_t>& memexport_stream_constants() const {
    return memexport_stream_constants_;
  }
-  bool is_valid_memexport_used() const {
-    return !memexport_stream_constants_.empty();
-  }

  // Labels that jumps (explicit or from loops) can be done to.
  const std::set<uint32_t>& label_addresses() const { return label_addresses_; }
@ -969,7 +978,7 @@ class Shader {
    // TODO(Triang3l): Investigate what happens to memexport when the pixel
    // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
    // depth/stencil.
-    return !kills_pixels() && !writes_depth() && !is_valid_memexport_used();
+    return !kills_pixels() && !writes_depth() && !memexport_eM_written();
  }

  // Whether each color render target is written to on any execution path.
@ -1041,8 +1050,6 @@ class Shader {
  std::vector<VertexBinding> vertex_bindings_;
  std::vector<TextureBinding> texture_bindings_;
  ConstantRegisterMap constant_register_map_ = {0};
-  uint8_t memexport_eM_written_[kMaxMemExports] = {};
-  std::set<uint32_t> memexport_stream_constants_;
  std::set<uint32_t> label_addresses_;
  uint32_t cf_pair_index_bound_ = 0;
  uint32_t register_static_address_bound_ = 0;
@ -1054,6 +1061,17 @@ class Shader {
  bool uses_texture_fetch_instruction_results_ = false;
  bool writes_depth_ = false;

+  // Memory export eM write info for each control flow instruction, if there are
+  // any eM writes in the shader.
+  std::vector<ControlFlowMemExportInfo> cf_memexport_info_;
+  // Which memexport elements (eM#) are written for any memexport in the shader.
+  uint8_t memexport_eM_written_ = 0;
+  // ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the
+  // end of the shader, for the last memory export (or exports if the end has
+  // multiple predecessor chains exporting to memory).
+  uint8_t memexport_eM_potentially_written_before_end_ = 0;
+  std::set<uint32_t> memexport_stream_constants_;
+
  // Modification bits -> translation.
  std::unordered_map<uint64_t, Translation*> translations_;

@ -1063,8 +1081,7 @@ class Shader {
  void GatherExecInformation(
      const ParsedExecInstruction& instr,
      ucode::VertexFetchInstruction& previous_vfetch_full,
-      uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
-      uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer);
+      uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer);
  void GatherVertexFetchInformation(
      const ucode::VertexFetchInstruction& op,
      ucode::VertexFetchInstruction& previous_vfetch_full,
@ -1073,13 +1090,12 @@ class Shader {
                                     uint32_t& unique_texture_bindings,
                                     StringBuffer& ucode_disasm_buffer);
  void GatherAluInstructionInformation(const ucode::AluInstruction& op,
-                                       uint32_t memexport_alloc_current_count,
-                                       uint32_t& memexport_eA_written,
+                                       uint32_t exec_cf_index,
                                       StringBuffer& ucode_disasm_buffer);
  void GatherOperandInformation(const InstructionOperand& operand);
  void GatherFetchResultInformation(const InstructionResult& result);
  void GatherAluResultInformation(const InstructionResult& result,
-                                  uint32_t memexport_alloc_current_count);
+                                  uint32_t exec_cf_index);
 };

 }  // namespace gpu
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@ -87,8 +87,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
  VertexFetchInstruction previous_vfetch_full;
  std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full));
  uint32_t unique_texture_bindings = 0;
-  uint32_t memexport_alloc_count = 0;
-  uint32_t memexport_eA_written = 0;
  for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
    ControlFlowInstruction cf_ab[2];
    UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab);
@ -111,8 +109,7 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
          ParsedExecInstruction instr;
          ParseControlFlowExec(cf.exec, cf_index, instr);
          GatherExecInformation(instr, previous_vfetch_full,
-                                unique_texture_bindings, memexport_alloc_count,
-                                memexport_eA_written, ucode_disasm_buffer);
+                                unique_texture_bindings, ucode_disasm_buffer);
        } break;
        case ControlFlowOpcode::kCondExec:
        case ControlFlowOpcode::kCondExecEnd:
@ -122,16 +119,14 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
          ParsedExecInstruction instr;
          ParseControlFlowCondExec(cf.cond_exec, cf_index, instr);
          GatherExecInformation(instr, previous_vfetch_full,
-                                unique_texture_bindings, memexport_alloc_count,
-                                memexport_eA_written, ucode_disasm_buffer);
+                                unique_texture_bindings, ucode_disasm_buffer);
        } break;
        case ControlFlowOpcode::kCondExecPred:
        case ControlFlowOpcode::kCondExecPredEnd: {
          ParsedExecInstruction instr;
          ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr);
          GatherExecInformation(instr, previous_vfetch_full,
-                                unique_texture_bindings, memexport_alloc_count,
-                                memexport_eA_written, ucode_disasm_buffer);
+                                unique_texture_bindings, ucode_disasm_buffer);
        } break;
        case ControlFlowOpcode::kLoopStart: {
          ParsedLoopStartInstruction instr;
@ -173,9 +168,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
          ParseControlFlowAlloc(cf.alloc, cf_index,
                                type() == xenos::ShaderType::kVertex, instr);
          instr.Disassemble(&ucode_disasm_buffer);
-          if (instr.type == AllocType::kMemory) {
-            ++memexport_alloc_count;
-          }
        } break;
        case ControlFlowOpcode::kMarkVsFetchDone:
          break;
@ -187,7 +179,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
        constant_register_map_.bool_bitmap[bool_constant_index / 32] |=
            uint32_t(1) << (bool_constant_index % 32);
      }
-      // TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))?
    }
  }
  ucode_disassembly_ = ucode_disasm_buffer.to_string();
@ -206,16 +197,124 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
    }
  }

-  // Cleanup invalid/unneeded memexport allocs.
-  for (uint32_t i = 0; i < kMaxMemExports; ++i) {
-    if (!(memexport_eA_written & (uint32_t(1) << i))) {
-      memexport_eM_written_[i] = 0;
-    } else if (!memexport_eM_written_[i]) {
-      memexport_eA_written &= ~(uint32_t(1) << i);
+  if (!cf_memexport_info_.empty()) {
+    // Gather potentially "dirty" memexport elements before each control flow
+    // instruction. `alloc` (any, not only `export`) flushes the previous memory
+    // export. On the guest GPU, yielding / serializing also terminates memory
+    // exports, but for simplicity disregarding that, as that functionally does
+    // nothing compared to flushing the previous memory export only at `alloc`
+    // or even only specifically at `alloc export`, Microsoft's validator checks
+    // if eM# aren't written after a `serialize`.
+    std::vector<uint32_t> successor_stack;
+    for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
+      ControlFlowInstruction eM_writing_cf_ab[2];
+      UnpackControlFlowInstructions(ucode_data_.data() + i * 3,
+                                    eM_writing_cf_ab);
+      for (uint32_t j = 0; j < 2; ++j) {
+        uint32_t eM_writing_cf_index = i * 2 + j;
+        uint32_t eM_written_by_cf_instr =
+            cf_memexport_info_[eM_writing_cf_index]
+                .eM_potentially_written_by_exec;
+        if (eM_writing_cf_ab[j].opcode() == ControlFlowOpcode::kCondCall) {
+          // Until subroutine calls are handled accurately, assume that all eM#
+          // have potentially been written by the subroutine for simplicity.
+          eM_written_by_cf_instr = memexport_eM_written_;
+        }
+        if (!eM_written_by_cf_instr) {
+          continue;
+        }
+
+        // If the control flow instruction potentially results in any eM# being
+        // written, mark those eM# as potentially written before each successor.
+        bool is_successor_graph_head = true;
+        successor_stack.push_back(eM_writing_cf_index);
+        while (!successor_stack.empty()) {
+          uint32_t successor_cf_index = successor_stack.back();
+          successor_stack.pop_back();
+
+          ControlFlowMemExportInfo& successor_memexport_info =
+              cf_memexport_info_[successor_cf_index];
+          if ((successor_memexport_info.eM_potentially_written_before &
+               eM_written_by_cf_instr) == eM_written_by_cf_instr) {
+            // Already marked as written before this instruction (and thus its
+            // successors too), e.g. in a loop where it succeeds itself. Skip
+            // only this path, other pending successors must still be visited.
+            continue;
+          }
+          // The first instruction in the traversal is the writing instruction
+          // itself, not its successor. However, if it has been visited by the
+          // traversal twice, it's in a loop, so it succeeds itself, and thus
+          // writes from it are potentially done before it too.
+          if (!is_successor_graph_head) {
+            successor_memexport_info.eM_potentially_written_before |=
+                eM_written_by_cf_instr;
+          }
+          is_successor_graph_head = false;
+
+          ControlFlowInstruction successor_cf_ab[2];
+          UnpackControlFlowInstructions(
+              ucode_data_.data() + (successor_cf_index >> 1) * 3,
+              successor_cf_ab);
+          const ControlFlowInstruction& successor_cf =
+              successor_cf_ab[successor_cf_index & 1];
+
+          bool next_instr_is_new_successor = true;
+          switch (successor_cf.opcode()) {
+            case ControlFlowOpcode::kExecEnd:
+              // One successor: end.
+              memexport_eM_potentially_written_before_end_ |=
+                  eM_written_by_cf_instr;
+              next_instr_is_new_successor = false;
+              break;
+            case ControlFlowOpcode::kCondExecEnd:
+            case ControlFlowOpcode::kCondExecPredEnd:
+            case ControlFlowOpcode::kCondExecPredCleanEnd:
+              // Two successors: next, end.
+              memexport_eM_potentially_written_before_end_ |=
+                  eM_written_by_cf_instr;
+              break;
+            case ControlFlowOpcode::kLoopStart:
+              // Two successors: next, skip.
+              successor_stack.push_back(successor_cf.loop_start.address());
+              break;
+            case ControlFlowOpcode::kLoopEnd:
+              // Two successors: next, repeat.
+              successor_stack.push_back(successor_cf.loop_end.address());
+              break;
+            case ControlFlowOpcode::kCondCall:
+              // Two successors: next, target.
+              successor_stack.push_back(successor_cf.cond_call.address());
+              break;
+            case ControlFlowOpcode::kReturn:
+              // Currently treating all subroutine calls as potentially writing
+              // all eM# for simplicity, so just exit the subroutine.
+              next_instr_is_new_successor = false;
+              break;
+            case ControlFlowOpcode::kCondJmp:
+              // One or two successors: next if conditional, target.
+              successor_stack.push_back(successor_cf.cond_jmp.address());
+              if (successor_cf.cond_jmp.is_unconditional()) {
+                next_instr_is_new_successor = false;
+              }
+              break;
+            case ControlFlowOpcode::kAlloc:
+              // Any `alloc` ends the previous export.
+              next_instr_is_new_successor = false;
+              break;
+            default:
+              break;
+          }
+          if (next_instr_is_new_successor) {
+            if (successor_cf_index + 1 < (cf_pair_index_bound_ << 1)) {
+              successor_stack.push_back(successor_cf_index + 1);
+            } else {
+              memexport_eM_potentially_written_before_end_ |=
+                  eM_written_by_cf_instr;
+            }
+          }
+        }
      }
    }
-  if (memexport_eA_written == 0) {
-    memexport_stream_constants_.clear();
  }

  is_ucode_analyzed_ = true;
@ -250,8 +349,7 @@ uint32_t Shader::GetInterpolatorInputMask(reg::SQ_PROGRAM_CNTL sq_program_cntl,
 void Shader::GatherExecInformation(
    const ParsedExecInstruction& instr,
    ucode::VertexFetchInstruction& previous_vfetch_full,
-    uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
-    uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
+    uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer) {
  instr.Disassemble(&ucode_disasm_buffer);
  uint32_t sequence = instr.sequence;
  for (uint32_t instr_offset = instr.instruction_address;
@ -273,8 +371,7 @@ void Shader::GatherExecInformation(
      }
    } else {
      auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
-      GatherAluInstructionInformation(op, memexport_alloc_current_count,
-                                      memexport_eA_written,
+      GatherAluInstructionInformation(op, instr.dword_index,
                                      ucode_disasm_buffer);
    }
  }
@ -381,8 +478,8 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
 }

 void Shader::GatherAluInstructionInformation(
-    const AluInstruction& op, uint32_t memexport_alloc_current_count,
-    uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
+    const AluInstruction& op, uint32_t exec_cf_index,
+    StringBuffer& ucode_disasm_buffer) {
  ParsedAluInstruction instr;
  ParseAluInstruction(op, type(), instr);
  instr.Disassemble(&ucode_disasm_buffer);
@ -394,10 +491,8 @@ void Shader::GatherAluInstructionInformation(
      (ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state &
       ucode::kAluOpChangedStatePixelKill);

-  GatherAluResultInformation(instr.vector_and_constant_result,
-                             memexport_alloc_current_count);
-  GatherAluResultInformation(instr.scalar_result,
-                             memexport_alloc_current_count);
+  GatherAluResultInformation(instr.vector_and_constant_result, exec_cf_index);
+  GatherAluResultInformation(instr.scalar_result, exec_cf_index);
  for (size_t i = 0; i < instr.vector_operand_count; ++i) {
    GatherOperandInformation(instr.vector_operands[i]);
  }
@ -405,9 +500,7 @@ void Shader::GatherAluInstructionInformation(
    GatherOperandInformation(instr.scalar_operands[i]);
  }

-  // Store used memexport constants because CPU code needs addresses and sizes,
-  // and also whether there have been writes to eA and eM# for register
-  // allocation in shader translator implementations.
+  // Store used memexport constants because CPU code needs addresses and sizes.
  // eA is (hopefully) always written to using:
  // mad eA, r#, const0100, c#
  // (though there are some exceptions, shaders in 4D5307E6 for some reason set
@ -416,13 +509,9 @@ void Shader::GatherAluInstructionInformation(
  // Export is done to vector_dest of the ucode instruction for both vector and
  // scalar operations - no need to check separately.
  if (instr.vector_and_constant_result.storage_target ==
-          InstructionStorageTarget::kExportAddress &&
-      memexport_alloc_current_count > 0 &&
-      memexport_alloc_current_count <= Shader::kMaxMemExports) {
+      InstructionStorageTarget::kExportAddress) {
    uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant();
    if (memexport_stream_constant != UINT32_MAX) {
-      memexport_eA_written |= uint32_t(1)
-                              << (memexport_alloc_current_count - 1);
      memexport_stream_constants_.insert(memexport_stream_constant);
    } else {
      XELOGE(
@ -481,8 +570,8 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) {
  }
 }

-void Shader::GatherAluResultInformation(
-    const InstructionResult& result, uint32_t memexport_alloc_current_count) {
+void Shader::GatherAluResultInformation(const InstructionResult& result,
+                                        uint32_t exec_cf_index) {
  uint32_t used_write_mask = result.GetUsedWriteMask();
  if (!used_write_mask) {
    return;
@ -504,11 +593,12 @@ void Shader::GatherAluResultInformation(
      writes_point_size_edge_flag_kill_vertex_ |= used_write_mask;
      break;
    case InstructionStorageTarget::kExportData:
-      if (memexport_alloc_current_count > 0 &&
-          memexport_alloc_current_count <= Shader::kMaxMemExports) {
-        memexport_eM_written_[memexport_alloc_current_count - 1] |=
-            uint32_t(1) << result.storage_index;
+      memexport_eM_written_ |= uint8_t(1) << result.storage_index;
+      if (cf_memexport_info_.empty()) {
+        cf_memexport_info_.resize(2 * cf_pair_index_bound_);
      }
+      cf_memexport_info_[exec_cf_index].eM_potentially_written_by_exec |=
+          uint32_t(1) << result.storage_index;
      break;
    case InstructionStorageTarget::kColor:
      writes_color_targets_ |= uint32_t(1) << result.storage_index;
@ -665,7 +755,13 @@ void ShaderTranslator::TranslateControlFlowInstruction(
    case ControlFlowOpcode::kAlloc: {
      ParsedAllocInstruction instr;
      ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr);
-      ProcessAllocInstruction(instr);
+      const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
+          current_shader().cf_memexport_info();
+      ProcessAllocInstruction(instr,
+                              instr.dword_index < cf_memexport_info.size()
+                                  ? cf_memexport_info[instr.dword_index]
+                                        .eM_potentially_written_before
+                                  : 0);
    } break;
    case ControlFlowOpcode::kMarkVsFetchDone:
      break;
@ -807,6 +903,14 @@ void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf,
 void ShaderTranslator::TranslateExecInstructions(
    const ParsedExecInstruction& instr) {
  ProcessExecInstructionBegin(instr);
+
+  const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
+      current_shader().cf_memexport_info();
+  uint8_t eM_potentially_written_before =
+      instr.dword_index < cf_memexport_info.size()
+          ? cf_memexport_info[instr.dword_index].eM_potentially_written_before
+          : 0;
+
  const uint32_t* ucode_dwords = current_shader().ucode_data().data();
  uint32_t sequence = instr.sequence;
  for (uint32_t instr_offset = instr.instruction_address;
@ -832,9 +936,22 @@ void ShaderTranslator::TranslateExecInstructions(
      auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
      ParsedAluInstruction alu_instr;
      ParseAluInstruction(op, current_shader().type(), alu_instr);
-      ProcessAluInstruction(alu_instr);
+      ProcessAluInstruction(alu_instr, eM_potentially_written_before);
+      if (alu_instr.vector_and_constant_result.storage_target ==
+              InstructionStorageTarget::kExportData &&
+          alu_instr.vector_and_constant_result.GetUsedWriteMask()) {
+        eM_potentially_written_before |=
+            uint8_t(1) << alu_instr.vector_and_constant_result.storage_index;
+      }
+      if (alu_instr.scalar_result.storage_target ==
+              InstructionStorageTarget::kExportData &&
+          alu_instr.scalar_result.GetUsedWriteMask()) {
+        eM_potentially_written_before |=
+            uint8_t(1) << alu_instr.scalar_result.storage_index;
      }
    }
+  }
+
  ProcessExecInstructionEnd(instr);
 }

--- a/src/xenia/gpu/shader_translator.h
+++ b/src/xenia/gpu/shader_translator.h
@ -118,8 +118,10 @@ class ShaderTranslator {
  virtual void ProcessReturnInstruction(const ParsedReturnInstruction& instr) {}
  // Handles translation for jump instructions.
  virtual void ProcessJumpInstruction(const ParsedJumpInstruction& instr) {}
-  // Handles translation for alloc instructions.
-  virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr) {}
+  // Handles translation for alloc instructions. Memory exports for eM#
+  // indicated by export_eM must be performed, regardless of the alloc type.
+  virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
+                                       uint8_t export_eM) {}

  // Handles translation for vertex fetch instructions.
  virtual void ProcessVertexFetchInstruction(
@ -128,7 +130,13 @@ class ShaderTranslator {
  virtual void ProcessTextureFetchInstruction(
      const ParsedTextureFetchInstruction& instr) {}
  // Handles translation for ALU instructions.
-  virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {}
+  // memexport_eM_potentially_written_before needs to be handled by `kill`
+  // instructions to make sure memory exports for the eM# writes earlier in
+  // previous execs and the current exec are done before the invocation becomes
+  // inactive.
+  virtual void ProcessAluInstruction(
+      const ParsedAluInstruction& instr,
+      uint8_t memexport_eM_potentially_written_before) {}

 private:
  void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf);
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@ -134,7 +134,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
    // (32-bit only - 16-bit indices are always fetched via the Vulkan index
    // buffer).
    kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift,
-    // For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip,
+    // For HostVertexShaderTypes kMemExportCompute, kPointListAsTriangleStrip,
    // kRectangleListAsTriangleStrip, whether the vertex index needs to be
    // loaded from the index buffer (rather than using autogenerated indices),
    // and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad
@ -427,7 +427,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
      const ParsedVertexFetchInstruction& instr) override;
  void ProcessTextureFetchInstruction(
      const ParsedTextureFetchInstruction& instr) override;
-  void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
+  void ProcessAluInstruction(
+      const ParsedAluInstruction& instr,
+      uint8_t memexport_eM_potentially_written_before) override;

 private:
  struct TextureBinding {
@ -620,7 +622,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
    assert_true(edram_fragment_shader_interlock_);
    return !is_depth_only_fragment_shader_ &&
           !current_shader().writes_depth() &&
-           !current_shader().is_valid_memexport_used();
+           !current_shader().memexport_eM_written();
  }
  void FSI_LoadSampleMask(spv::Id msaa_samples);
  void FSI_LoadEdramOffsets(spv::Id msaa_samples);
--- a/src/xenia/gpu/spirv_shader_translator_alu.cc
+++ b/src/xenia/gpu/spirv_shader_translator_alu.cc
@ -67,7 +67,8 @@ void SpirvShaderTranslator::KillPixel(spv::Id condition) {
 }

 void SpirvShaderTranslator::ProcessAluInstruction(
-    const ParsedAluInstruction& instr) {
+    const ParsedAluInstruction& instr,
+    uint8_t memexport_eM_potentially_written_before) {
  if (instr.IsNop()) {
    // Don't even disassemble or update predication.
    return;
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@ -210,7 +210,7 @@ enum class AllocType : uint32_t {
  kVsInterpolators = 2,
  // Pixel shader exports colors.
  kPsColors = 2,
-  // MEMEXPORT?
+  // Memory export.
  kMemory = 3,
 };

@ -1782,6 +1782,9 @@ inline uint32_t GetAluVectorOpNeededSourceComponents(
                          .operand_components_used[src_index - 1];
 }

+// eM# (kExportData) register count.
+constexpr uint32_t kMaxMemExportElementCount = 5;
+
 enum class ExportRegister : uint32_t {
  kVSInterpolator0 = 0,
  kVSInterpolator1,
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@ -2187,7 +2187,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
    return false;
  }
  pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
-  bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
+  bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0;

  // Pixel shader analysis.
  bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -497,6 +497,18 @@ enum class TextureFormat : uint32_t {
  k_6_5_5 = 5,
  k_8_8_8_8 = 6,
  k_2_10_10_10 = 7,
+  // Possibly similar to k_8, but may be storing alpha instead of red when
+  // resolving/memexporting, though not exactly known. From the point of view of
+  // sampling, it should be treated the same as k_8 (given that textures have
+  // the last - and single-component textures have the only - component
+  // replicated into all the remaining ones before the swizzle).
+  // Used as:
+  // - Texture in 4B4E083C - text, starting from the "Loading..." and the "This
+  //   game saves data automatically" messages. The swizzle in the fetch
+  //   constant is 111W (suggesting that internally the only component may be
+  //   the alpha one, not red).
+  // TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and
+  // memexports, whether they store alpha/blue of the input or red.
  k_8_A = 8,
  k_8_B = 9,
  k_8_8 = 10,
@ -510,6 +522,12 @@ enum class TextureFormat : uint32_t {
  // Used for videos in 54540829.
  k_Y1_Cr_Y0_Cb_REP = 12,
  k_16_16_EDRAM = 13,
+  // Likely same as k_8_8_8_8.
+  // Used as:
+  // - Memexport destination in 4D5308BC - multiple small draws when looking
+  //   back at the door behind the player in the first room of gameplay.
+  // - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame
+  //   between the intro video and the main menu, in an 8192-point draw.
  k_8_8_8_8_A = 14,
  k_4_4_4_4 = 15,
  k_10_11_11 = 16,
@ -1373,8 +1391,7 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
 // memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also
 // interesting to see how alphatest interacts with it, whether it's still true
 // fixed-function alphatest, as it's claimed to be supported as usual by the
-// extension specification - it's likely, however, that memory exports are
-// discarded alongside other exports such as oC# and oDepth this way.
+// extension specification.
 //
 // Y of eA contains the offset in elements - this is what shaders are supposed
 // to calculate from something like the vertex index. Again, it's specified as
@ -1397,6 +1414,69 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
 // elements using packing via addition to 2^23, so this field also doesn't need
 // more bits than that.
 //
+// According to the sequencer specification from IPR2015-00325 (where memexport
+// is called "pass thru export"):
+// - Pass thru exports can occur anywhere in the shader program.
+// - There can be any number of pass thru exports.
+// - The address register is not kept across clause boundaries, so it must be
+//   refreshed after any Serialize (or yield), allocate instruction or resource
+//   change.
+// - The write to eM# may be predicated if the export is not needed.
+// - Exports are dropped if:
+//   - The index is above the maximum.
+//   - The index sign bit is 1.
+//   - The exponent of the index is not 23.
+// The requirement that eM4 must be written if any eM# other than eM0 is also
+// written doesn't apply to the final Xenos, it's likely an outdated note in the
+// specification considering that it's very preliminary.
+//
+// According to Microsoft's shader validator:
+// - eA can be written only by `mad`.
+// - A single eM# can be written by any number of instructions, including with
+//   write masking.
+// - eA must be written before eM#.
+// - Any alloc instruction or a `serialize` terminates the current memory
+//   export. This doesn't apply to `exec Yield=true`, however, and it's not
+//   clear if that's an oversight or if that's not considered a yield that
+//   terminates the export.
+//
+// From the emulation perspective, this means that:
+// - Alloc instructions (`alloc export` mandatorily, other allocs optionally),
+//   and optionally `serialize` instructions within `exec`, should be treated as
+//   the locations where the currently open export should be flushed to the
+//   memory. It should be taken into account that an export may be in looping
+//   control flow, and in this case it must be performed at every iteration.
+// - Whether each eM# was written to must be tracked at shader execution time,
+//   as predication can disable the export of an element.
+//
+// TODO(Triang3l): Investigate how memory export interacts with pixel killing.
+// Given that eM# writes disabled by predication don't cause an export, it's
+// possible that killed invocations are treated as inactive (invalid in Xenos
+// terms) overall, and thus new memory exports from them shouldn't be done, but
+// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan
+// hosts, discarding disables subsequent storage resource writes, on the host,
+// it would be natural to perform all outstanding memory exports before
+// discarding if the kill condition passes.
+//
+// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp
+// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for
+// instance, due to the minimum resource view size limitation on Direct3D 11).
+// In this case, bytes and shorts aren't addressable directly. However, taking
+// into account that memory accesses are coherent within one shader invocation
+// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relative to
+// each other, it should be possible to implement them by clearing the bits via
+// an atomic AND, and writing the new value using an atomic OR. This will, of
+// course, make the entire write operation non-atomic, and in case of a race
+// between writes to the same location, the final result may not even be just a
+// value from one of the invocations, but rather, it can be OR of the values
+// from any invocations involved. However, on the Xenos, there doesn't seem to
+// be any possibility of meaningfully accessing the same location from multiple
+// invocations if any of them is writing, memory exports are out-of-order, so
+// such an implementation shouldn't be causing issues in reality. Atomic
+// compare-exchange, however, should not be used for this purpose, as it may
+// result in an infinite loop if different invocations want to write different
+// values to the same memory location.
+//
 // Examples of setup in titles (Z from MSB to LSB):
 //
 // 4D5307E6 particles (different VS invocation counts, like 1, 2, 4):
@ -1432,6 +1512,11 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
 // c0: Z = 010010110000|0|010|11|011010|00011|001
 //   8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch
 //   (16_16_16_16 is the largest color format without special values)
+//
+// 58410B86 hierarchical depth buffer occlusion culling with the result read on
+// the CPU (15000 VS invocations in the main menu):
+// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count
+//   No endian swap, 8, uint, RGBA
 union alignas(uint32_t) xe_gpu_memexport_stream_t {
  struct {
    uint32_t dword_0;
--- a/src/xenia/kernel/xam/xam_content.cc
+++ b/src/xenia/kernel/xam/xam_content.cc
@ -119,6 +119,8 @@ dword_result_t XamContentCreateEnumerator_entry(
 }
 DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented);

+enum class kDispositionState : uint32_t { Unknown = 0, Create = 1, Open = 2 };
+
 dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
                                  lpvoid_t content_data_ptr,
                                  dword_t content_data_size, dword_t flags,
@ -146,40 +148,37 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
              content_data, disposition_ptr, license_mask_ptr, overlapped_ptr](
                 uint32_t& extended_error, uint32_t& length) -> X_RESULT {
    X_RESULT result = X_ERROR_INVALID_PARAMETER;
-    bool create = false;
-    bool open = false;
+    kDispositionState disposition = kDispositionState::Unknown;
    switch (flags & 0xF) {
      case 1:  // CREATE_NEW
               // Fail if exists.
        if (content_manager->ContentExists(content_data)) {
          result = X_ERROR_ALREADY_EXISTS;
        } else {
-          create = true;
+          disposition = kDispositionState::Create;
        }
        break;
      case 2:  // CREATE_ALWAYS
               // Overwrite existing, if any.
        if (content_manager->ContentExists(content_data)) {
          content_manager->DeleteContent(content_data);
-          create = true;
-        } else {
-          create = true;
        }
+        disposition = kDispositionState::Create;
        break;
      case 3:  // OPEN_EXISTING
               // Open only if exists.
        if (!content_manager->ContentExists(content_data)) {
          result = X_ERROR_PATH_NOT_FOUND;
        } else {
-          open = true;
+          disposition = kDispositionState::Open;
        }
        break;
      case 4:  // OPEN_ALWAYS
               // Create if needed.
        if (!content_manager->ContentExists(content_data)) {
-          create = true;
+          disposition = kDispositionState::Create;
        } else {
-          open = true;
+          disposition = kDispositionState::Open;
        }
        break;
      case 5:  // TRUNCATE_EXISTING
@ -188,7 +187,7 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
          result = X_ERROR_PATH_NOT_FOUND;
        } else {
          content_manager->DeleteContent(content_data);
-          create = true;
+          disposition = kDispositionState::Create;
        }
        break;
      default:
@ -196,21 +195,12 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
        break;
    }

-    // creation result
-    // 0 = ?
-    // 1 = created
-    // 2 = opened
-    uint32_t disposition = create ? 1 : 2;
-    if (disposition_ptr) {
-      *disposition_ptr = disposition;
-    }
-
-    if (create) {
+    if (disposition == kDispositionState::Create) {
      result = content_manager->CreateContent(root_name, content_data);
      if (XSUCCEEDED(result)) {
        content_manager->WriteContentHeaderFile(&content_data);
      }
-    } else if (open) {
+    } else if (disposition == kDispositionState::Open) {
      result = content_manager->OpenContent(root_name, content_data);
    }

@ -224,12 +214,11 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
    }

    extended_error = X_HRESULT_FROM_WIN32(result);
-    length = disposition;
+    length = static_cast<uint32_t>(disposition);

    if (result && overlapped_ptr) {
      result = X_ERROR_FUNCTION_FAILED;
    }
-
    return result;
  };

@ -451,7 +440,6 @@ static_assert_size(X_SWAPDISC_ERROR_MESSAGE, 12);
 dword_result_t XamSwapDisc_entry(
    dword_t disc_number, pointer_t<X_KEVENT> completion_handle,
    pointer_t<X_SWAPDISC_ERROR_MESSAGE> error_message) {
-
  xex2_opt_execution_info* info = nullptr;
  kernel_state()->GetExecutableModule()->GetOptHeader(XEX_HEADER_EXECUTION_INFO,
                                                      &info);
--- a/src/xenia/kernel/xam/xam_info.cc
+++ b/src/xenia/kernel/xam/xam_info.cc
@ -254,202 +254,15 @@ dword_result_t XGetLanguage_entry() {
 }
 DECLARE_XAM_EXPORT1(XGetLanguage, kNone, kImplemented);

-// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/
-// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/
-dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) {
-  LARGE_INTEGER delay{};
-
-  // Convert the delay time to 100-nanosecond intervals
-  delay.QuadPart = dwMilliseconds == -1
-                       ? LLONG_MAX
-                       : static_cast<LONGLONG>(-10000) * dwMilliseconds;
-
-  X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
-                                                     (uint64_t*)&delay);
-
-  // If the delay was interrupted by an APC, keep delaying the thread
-  while (bAlertable && result == X_STATUS_ALERTED) {
-    result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
-                                              (uint64_t*)&delay);
-  }
-
-  return result == X_STATUS_SUCCESS ? X_STATUS_SUCCESS : X_STATUS_USER_APC;
-}
-DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented);
-
-dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) {
-  return RtlSleep_entry(dwMilliseconds, bAlertable);
-}
-DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented);
-
-// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep
-void Sleep_entry(dword_t dwMilliseconds) {
-  RtlSleep_entry(dwMilliseconds, FALSE);
-}
-DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented);
-
-// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount
-dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); }
-DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented);
-
 dword_result_t XamGetCurrentTitleId_entry() {
  return kernel_state()->emulator()->title_id();
 }
 DECLARE_XAM_EXPORT1(XamGetCurrentTitleId, kNone, kImplemented);

-dword_result_t RtlSetLastNTError_entry(dword_t error_code) {
-  const uint32_t result =
-      xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code);
-  XThread::SetLastError(result);
-
-  return result;
+dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) {
+  return ctx->kernel_state->title_id() == 0xFFFE07D1;
 }
-DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented);
-
-dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); }
-DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented);
-
-dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); }
-DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented);
-
-dword_result_t GetModuleHandleA_entry(lpstring_t module_name) {
-  xe::be<uint32_t> module_ptr = 0;
-  const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle(
-      module_name.value(), &module_ptr);
-
-  if (XFAILED(error_code)) {
-    RtlSetLastNTError_entry(error_code);
-
-    return NULL;
-  }
-
-  return (uint32_t)module_ptr;
-}
-DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented);
-
-dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes,
-                                       dword_t dwStackSize,
-                                       lpvoid_t lpStartAddress,
-                                       lpvoid_t lpParameter,
-                                       dword_t dwCreationFlags, dword_t unkn,
-                                       lpdword_t lpThreadId) {
-  uint32_t flags = (dwCreationFlags >> 2) & 1;
-
-  if (unkn != -1) {
-    flags |= 1 << unkn << 24;
-  }
-
-  xe::be<uint32_t> result = 0;
-
-  const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread(
-      &result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags);
-
-  if (XFAILED(error_code)) {
-    RtlSetLastNTError_entry(error_code);
-
-    return NULL;
-  }
-
-  return (uint32_t)result;
-}
-DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented);
-
-dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes,
-                                  dword_t dwStackSize, lpvoid_t lpStartAddress,
-                                  lpvoid_t lpParameter, dword_t dwCreationFlags,
-                                  lpdword_t lpThreadId) {
-  return XapipCreateThread_entry(lpThreadAttributes, dwStackSize,
-                                 lpStartAddress, lpParameter, dwCreationFlags,
-                                 -1, lpThreadId);
-}
-DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented);
-
-dword_result_t CloseHandle_entry(dword_t hObject) {
-  const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject);
-
-  if (XFAILED(error_code)) {
-    RtlSetLastNTError_entry(error_code);
-
-    return false;
-  }
-
-  return true;
-}
-DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented);
-
-dword_result_t ResumeThread_entry(dword_t hThread) {
-  uint32_t suspend_count;
-  const X_STATUS error_code =
-      xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count);
-
-  if (XFAILED(error_code)) {
-    RtlSetLastNTError_entry(error_code);
-
-    return -1;
-  }
-
-  return suspend_count;
-}
-DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented);
-
-void ExitThread_entry(dword_t exit_code) {
-  xe::kernel::xboxkrnl::ExTerminateThread(exit_code);
-}
-DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented);
-
-dword_result_t GetCurrentThreadId_entry() {
-  return XThread::GetCurrentThread()->GetCurrentThreadId();
-}
-DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented);
-
-qword_result_t XapiFormatTimeOut_entry(lpqword_t result,
-                                       dword_t dwMilliseconds) {
-  LARGE_INTEGER delay{};
-
-  // Convert the delay time to 100-nanosecond intervals
-  delay.QuadPart =
-      dwMilliseconds == -1 ? 0 : static_cast<LONGLONG>(-10000) * dwMilliseconds;
-
-  return (uint64_t)&delay;
-}
-DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented);
-
-dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle,
-                                           dword_t dwMilliseconds,
-                                           dword_t bAlertable) {
-  uint64_t* timeout = nullptr;
-  uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds);
-
-  X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
-      hHandle, 1, bAlertable, &timeout_ptr);
-
-  while (bAlertable && result == X_STATUS_ALERTED) {
-    result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
-        hHandle, 1, bAlertable, &timeout_ptr);
-  }
-
-  RtlSetLastNTError_entry(result);
-  result = -1;
-
-  return result;
-}
-DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented);
-
-dword_result_t WaitForSingleObject_entry(dword_t hHandle,
-                                         dword_t dwMilliseconds) {
-  return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0);
-}
-DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented);
-
-dword_result_t lstrlenW_entry(lpu16string_t string) {
-  // wcslen?
-  if (string) {
-    return (uint32_t)string.value().length();
-  }
-
-  return NULL;
-}
-DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented);
+DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented);

 dword_result_t XamGetExecutionId_entry(lpdword_t info_ptr) {
  auto module = kernel_state()->GetExecutableModule();
@ -611,16 +424,204 @@ dword_result_t XamQueryLiveHiveW_entry(lpu16string_t name, lpvoid_t out_buf,
 }
 DECLARE_XAM_EXPORT1(XamQueryLiveHiveW, kNone, kStub);

-dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) {
-  return ctx->kernel_state->title_id() == 0xFFFE07D1;
+// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/
+// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/
+dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) {
+  LARGE_INTEGER delay{};
+
+  // Convert the delay time to 100-nanosecond intervals
+  delay.QuadPart = dwMilliseconds == -1
+                       ? LLONG_MAX
+                       : static_cast<LONGLONG>(-10000) * dwMilliseconds;
+
+  X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
+                                                     (uint64_t*)&delay);
+
+  // If the delay was interrupted by an APC, keep delaying the thread
+  while (bAlertable && result == X_STATUS_ALERTED) {
+    result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
+                                              (uint64_t*)&delay);
+  }
+
+  return result == X_STATUS_SUCCESS ? X_STATUS_SUCCESS : X_STATUS_USER_APC;
 }
-DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented);
+DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented);
+
+dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) {
+  return RtlSleep_entry(dwMilliseconds, bAlertable);
+}
+DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented);
+
+// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep
+void Sleep_entry(dword_t dwMilliseconds) {
+  RtlSleep_entry(dwMilliseconds, FALSE);
+}
+DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented);
+
+// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount
+dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); }
+DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented);
+
+dword_result_t RtlSetLastNTError_entry(dword_t error_code) {
+  const uint32_t result =
+      xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code);
+  XThread::SetLastError(result);
+
+  return result;
+}
+DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented);
+
+dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); }
+DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented);
+
+dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); }
+DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented);
+
+dword_result_t GetModuleHandleA_entry(lpstring_t module_name) {
+  xe::be<uint32_t> module_ptr = 0;
+  const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle(
+      module_name.value(), &module_ptr);
+
+  if (XFAILED(error_code)) {
+    RtlSetLastNTError_entry(error_code);
+
+    return NULL;
+  }
+
+  return (uint32_t)module_ptr;
+}
+DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented);
+
+dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes,
+                                       dword_t dwStackSize,
+                                       lpvoid_t lpStartAddress,
+                                       lpvoid_t lpParameter,
+                                       dword_t dwCreationFlags, dword_t unkn,
+                                       lpdword_t lpThreadId) {
+  uint32_t flags = (dwCreationFlags >> 2) & 1;
+
+  if (unkn != -1) {
+    flags |= 1 << unkn << 24;
+  }
+
+  xe::be<uint32_t> result = 0;
+
+  const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread(
+      &result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags);
+
+  if (XFAILED(error_code)) {
+    RtlSetLastNTError_entry(error_code);
+
+    return NULL;
+  }
+
+  return (uint32_t)result;
+}
+DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented);
+
+dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes,
+                                  dword_t dwStackSize, lpvoid_t lpStartAddress,
+                                  lpvoid_t lpParameter, dword_t dwCreationFlags,
+                                  lpdword_t lpThreadId) {
+  return XapipCreateThread_entry(lpThreadAttributes, dwStackSize,
+                                 lpStartAddress, lpParameter, dwCreationFlags,
+                                 -1, lpThreadId);
+}
+DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented);
+
+dword_result_t CloseHandle_entry(dword_t hObject) {
+  const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject);
+
+  if (XFAILED(error_code)) {
+    RtlSetLastNTError_entry(error_code);
+
+    return false;
+  }
+
+  return true;
+}
+DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented);
+
+dword_result_t ResumeThread_entry(dword_t hThread) {
+  uint32_t suspend_count;
+  const X_STATUS error_code =
+      xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count);
+
+  if (XFAILED(error_code)) {
+    RtlSetLastNTError_entry(error_code);
+
+    return -1;
+  }
+
+  return suspend_count;
+}
+DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented);
+
+void ExitThread_entry(dword_t exit_code) {
+  xe::kernel::xboxkrnl::ExTerminateThread(exit_code);
+}
+DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented);
+
+dword_result_t GetCurrentThreadId_entry() {
+  return XThread::GetCurrentThread()->GetCurrentThreadId();
+}
+DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented);
+
+qword_result_t XapiFormatTimeOut_entry(lpqword_t result,
+                                       dword_t dwMilliseconds) {
+  LARGE_INTEGER delay{};
+
+  // Convert the delay time to 100-nanosecond intervals
+  delay.QuadPart =
+      dwMilliseconds == -1 ? 0 : static_cast<LONGLONG>(-10000) * dwMilliseconds;
+
+  return (uint64_t)&delay;
+}
+DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented);
+
+dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle,
+                                           dword_t dwMilliseconds,
+                                           dword_t bAlertable) {
+  uint64_t* timeout = nullptr;
+  uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds);
+
+  X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
+      hHandle, 1, bAlertable, &timeout_ptr);
+
+  while (bAlertable && result == X_STATUS_ALERTED) {
+    result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
+        hHandle, 1, bAlertable, &timeout_ptr);
+  }
+
+  RtlSetLastNTError_entry(result);
+  result = -1;
+
+  return result;
+}
+DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented);
+
+dword_result_t WaitForSingleObject_entry(dword_t hHandle,
+                                         dword_t dwMilliseconds) {
+  return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0);
+}
+DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented);
+
+dword_result_t lstrlenW_entry(lpu16string_t string) {
+  // wcslen?
+  if (string) {
+    return (uint32_t)string.value().length();
+  }
+
+  return NULL;
+}
+DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented);

 dword_result_t XGetAudioFlags_entry() { return 65537; }
 DECLARE_XAM_EXPORT1(XGetAudioFlags, kNone, kStub);

 /*
-	todo: this table should instead be pointed to by a member of kernel state and initialized along with the process
+        todo: this table should instead be pointed to by a member of kernel
+   state and initialized along with the process
 */
 static int32_t XamRtlRandomTable[128] = {
    1284227242, 1275210071, 573735546,  790525478,  2139871995, 1547161642,