[GPU/D3D12] Memexport from anywhere in control flow + 8/16bpp memexport

There's no limit on the number of memory exports in a shader on the real
Xenos, and exports can be done anywhere, including in loops. Now, instead
of deferring the exports to the end of the shader, and assuming that export
allocs are executed only once, Xenia flushes exports when it reaches an
alloc (allocs terminate memory exports on Xenos, as do individual ALU
instructions with `serialize`; that case is not handled, for simplicity,
since it's only truly mandatory to flush memory exports before starting a
new one), the end of the shader, or when a pixel with outstanding exports
is killed.

To know which eM# registers need to be flushed to memory, Xenia traverses
the successors of each exec that potentially writes any eM#, recording
which eM# registers may have been written before each control flow
instruction reached, until a flush point or the end of the shader is
reached.

Also, some games export to sub-32bpp formats. These are now supported via
atomic AND clearing the bits of the dword to replace followed by an atomic
OR inserting the new byte/short.
This commit is contained in:
Triang3l 2023-05-05 21:05:23 +03:00
parent 8aaa6f1f7d
commit 53f98d1fe6
17 changed files with 1437 additions and 849 deletions

View File

@ -2125,7 +2125,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
return false;
}
pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
bool memexport_used_vertex = vertex_shader->memexport_eM_written();
// Pixel shader analysis.
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
@ -2154,7 +2154,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
}
}
bool memexport_used_pixel =
pixel_shader && pixel_shader->is_valid_memexport_used();
pixel_shader && pixel_shader->memexport_eM_written();
bool memexport_used = memexport_used_vertex || memexport_used_pixel;
if (!BeginSubmission(true)) {
@ -2341,100 +2341,20 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// Gather memexport ranges and ensure the heaps for them are resident, and
// also load the data surrounding the export and to fill the regions that
// won't be modified by the shaders.
struct MemExportRange {
uint32_t base_address_dwords;
uint32_t size_dwords;
};
MemExportRange memexport_ranges[512];
uint32_t memexport_range_count = 0;
memexport_ranges_.clear();
if (memexport_used_vertex) {
for (uint32_t constant_index :
vertex_shader->memexport_stream_constants()) {
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
if (memexport_stream.index_count == 0) {
continue;
}
uint32_t memexport_format_size =
GetSupportedMemExportFormatSize(memexport_stream.format);
if (memexport_format_size == 0) {
XELOGE("Unsupported memexport format {}",
FormatInfo::Get(
xenos::TextureFormat(uint32_t(memexport_stream.format)))
->name);
return false;
}
uint32_t memexport_size_dwords =
memexport_stream.index_count * memexport_format_size;
// Try to reduce the number of shared memory operations when writing
// different elements into the same buffer through different exports
// (happens in 4D5307E6).
bool memexport_range_reused = false;
for (uint32_t i = 0; i < memexport_range_count; ++i) {
MemExportRange& memexport_range = memexport_ranges[i];
if (memexport_range.base_address_dwords ==
memexport_stream.base_address) {
memexport_range.size_dwords =
std::max(memexport_range.size_dwords, memexport_size_dwords);
memexport_range_reused = true;
break;
}
}
// Add a new range if haven't expanded an existing one.
if (!memexport_range_reused) {
MemExportRange& memexport_range =
memexport_ranges[memexport_range_count++];
memexport_range.base_address_dwords = memexport_stream.base_address;
memexport_range.size_dwords = memexport_size_dwords;
}
}
draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_);
}
if (memexport_used_pixel) {
for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
if (memexport_stream.index_count == 0) {
continue;
draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_);
}
uint32_t memexport_format_size =
GetSupportedMemExportFormatSize(memexport_stream.format);
if (memexport_format_size == 0) {
XELOGE("Unsupported memexport format {}",
FormatInfo::Get(
xenos::TextureFormat(uint32_t(memexport_stream.format)))
->name);
return false;
}
uint32_t memexport_size_dwords =
memexport_stream.index_count * memexport_format_size;
bool memexport_range_reused = false;
for (uint32_t i = 0; i < memexport_range_count; ++i) {
MemExportRange& memexport_range = memexport_ranges[i];
if (memexport_range.base_address_dwords ==
memexport_stream.base_address) {
memexport_range.size_dwords =
std::max(memexport_range.size_dwords, memexport_size_dwords);
memexport_range_reused = true;
break;
}
}
if (!memexport_range_reused) {
MemExportRange& memexport_range =
memexport_ranges[memexport_range_count++];
memexport_range.base_address_dwords = memexport_stream.base_address;
memexport_range.size_dwords = memexport_size_dwords;
}
}
}
for (uint32_t i = 0; i < memexport_range_count; ++i) {
const MemExportRange& memexport_range = memexport_ranges[i];
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
memexport_range.size_dwords << 2)) {
memexport_range.size_bytes)) {
XELOGE(
"Failed to request memexport stream at 0x{:08X} (size {}) in the "
"shared memory",
memexport_range.base_address_dwords << 2,
memexport_range.size_dwords << 2);
memexport_range.base_address_dwords << 2, memexport_range.size_bytes);
return false;
}
}
@ -2594,17 +2514,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// when memexports should be awaited?
shared_memory_->MarkUAVWritesCommitNeeded();
// Invalidate textures in memexported memory and watch for changes.
for (uint32_t i = 0; i < memexport_range_count; ++i) {
const MemExportRange& memexport_range = memexport_ranges[i];
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
shared_memory_->RangeWrittenByGpu(
memexport_range.base_address_dwords << 2,
memexport_range.size_dwords << 2, false);
memexport_range.base_address_dwords << 2, memexport_range.size_bytes,
false);
}
if (cvars::d3d12_readback_memexport) {
// Read the exported data on the CPU.
uint32_t memexport_total_size = 0;
for (uint32_t i = 0; i < memexport_range_count; ++i) {
memexport_total_size += memexport_ranges[i].size_dwords << 2;
for (const draw_util::MemExportRange& memexport_range :
memexport_ranges_) {
memexport_total_size += memexport_range.size_bytes;
}
if (memexport_total_size != 0) {
ID3D12Resource* readback_buffer =
@ -2614,9 +2534,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
SubmitBarriers();
ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
uint32_t readback_buffer_offset = 0;
for (uint32_t i = 0; i < memexport_range_count; ++i) {
const MemExportRange& memexport_range = memexport_ranges[i];
uint32_t memexport_range_size = memexport_range.size_dwords << 2;
for (const draw_util::MemExportRange& memexport_range :
memexport_ranges_) {
uint32_t memexport_range_size = memexport_range.size_bytes;
deferred_command_list_.D3DCopyBufferRegion(
readback_buffer, readback_buffer_offset, shared_memory_buffer,
memexport_range.base_address_dwords << 2, memexport_range_size);
@ -2629,14 +2549,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
void* readback_mapping;
if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
&readback_mapping))) {
const uint32_t* readback_dwords =
reinterpret_cast<const uint32_t*>(readback_mapping);
for (uint32_t i = 0; i < memexport_range_count; ++i) {
const MemExportRange& memexport_range = memexport_ranges[i];
const uint8_t* readback_bytes =
reinterpret_cast<const uint8_t*>(readback_mapping);
for (const draw_util::MemExportRange& memexport_range :
memexport_ranges_) {
std::memcpy(memory_->TranslatePhysical(
memexport_range.base_address_dwords << 2),
readback_dwords, memexport_range.size_dwords << 2);
readback_dwords += memexport_range.size_dwords;
readback_bytes, memexport_range.size_bytes);
readback_bytes += memexport_range.size_bytes;
}
D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range);
@ -4510,36 +4430,6 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader,
return true;
}
// Returns the number of dwords written per element for a memexport color
// format, or 0 if the format is not supported by the D3D12 command processor
// (for instance, formats smaller than one dword per element).
uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize(
    xenos::ColorFormat format) {
  switch (format) {
    // Formats packing one element into a single dword.
    case xenos::ColorFormat::k_8_8_8_8:
    case xenos::ColorFormat::k_2_10_10_10:
    // TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the
    // texture cache currently.
    // case xenos::ColorFormat::k_8_8_8_8_A:
    case xenos::ColorFormat::k_10_11_11:
    case xenos::ColorFormat::k_11_11_10:
    case xenos::ColorFormat::k_16_16:
    case xenos::ColorFormat::k_16_16_FLOAT:
    case xenos::ColorFormat::k_32_FLOAT:
    case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16:
    case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16:
    case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16:
    case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16:
      return 1;
    // Formats needing two dwords per element.
    case xenos::ColorFormat::k_16_16_16_16:
    case xenos::ColorFormat::k_16_16_16_16_FLOAT:
    case xenos::ColorFormat::k_32_32_FLOAT:
      return 2;
    // Formats needing four dwords per element.
    case xenos::ColorFormat::k_32_32_32_32_FLOAT:
      return 4;
    default:
      break;
  }
  // Not exportable via this path.
  return 0;
}
ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
if (size == 0) {
return nullptr;

View File

@ -18,6 +18,7 @@
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/gpu/command_processor.h"
@ -378,13 +379,6 @@ class D3D12CommandProcessor : public CommandProcessor {
ID3D12RootSignature* root_signature,
bool shared_memory_is_uav);
// Returns dword count for one element for a memexport format, or 0 if it's
// not supported by the D3D12 command processor (if it's smaller than 1 dword,
// for instance).
// TODO(Triang3l): Check if any game uses memexport with formats smaller than
// 32 bits per element.
static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format);
// Returns a buffer for reading GPU data back to the CPU. Assuming
// synchronizing immediately after use. Always in COPY_DEST state.
ID3D12Resource* RequestReadbackBuffer(uint32_t size);
@ -684,6 +678,9 @@ class D3D12CommandProcessor : public CommandProcessor {
// Current primitive topology.
D3D_PRIMITIVE_TOPOLOGY primitive_topology_;
// Temporary storage for memexport stream constants used in the draw.
std::vector<draw_util::MemExportRange> memexport_ranges_;
};
} // namespace d3d12

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Copyright 2023 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -141,7 +141,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
//
// Memory export is an obvious intentional side effect.
if (shader.kills_pixels() || shader.writes_depth() ||
shader.is_valid_memexport_used() ||
shader.memexport_eM_written() ||
(shader.writes_color_target(0) &&
DoesCoverageDependOnAlpha(regs.Get<reg::RB_COLORCONTROL>()))) {
return true;
@ -651,6 +651,65 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
return normalized_color_mask;
}
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
std::vector<MemExportRange>& ranges_out) {
if (!shader.memexport_eM_written()) {
// The shader has eA writes, but no real exports.
return;
}
uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex
? regs.Get<reg::SQ_VS_CONST>().base
: regs.Get<reg::SQ_PS_CONST>().base;
for (uint32_t constant_index : shader.memexport_stream_constants()) {
const auto& stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
XE_GPU_REG_SHADER_CONSTANT_000_X +
(float_constants_base + constant_index) * 4);
if (!stream.index_count) {
continue;
}
const FormatInfo& format_info =
*FormatInfo::Get(xenos::TextureFormat(stream.format));
if (format_info.type != FormatType::kResolvable) {
XELOGE("Unsupported memexport format {}", format_info.name);
// Translated shaders shouldn't be performing exports with an unknown
// format, the draw can still be performed.
continue;
}
// TODO(Triang3l): Remove the unresearched format logging when it's known
// how exactly these formats need to be handled (most importantly what
// components need to be stored and in which order).
switch (stream.format) {
case xenos::ColorFormat::k_8_A:
case xenos::ColorFormat::k_8_B:
case xenos::ColorFormat::k_8_8_8_8_A:
XELOGW(
"Memexport done to an unresearched format {}, report the game to "
"Xenia developers!",
format_info.name);
break;
default:
break;
}
uint32_t stream_size_bytes =
stream.index_count * (format_info.bits_per_pixel >> 3);
// Try to reduce the number of shared memory operations when writing
// different elements into the same buffer through different exports
// (happens in 4D5307E6).
bool range_reused = false;
for (MemExportRange& range : ranges_out) {
if (range.base_address_dwords == stream.base_address) {
range.size_bytes = std::max(range.size_bytes, stream_size_bytes);
range_reused = true;
break;
}
}
// Add a new range if haven't expanded an existing one.
if (!range_reused) {
ranges_out.emplace_back(stream.base_address, stream_size_bytes);
}
}
}
xenos::CopySampleSelect SanitizeCopySampleSelect(
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
bool is_depth) {

View File

@ -13,6 +13,7 @@
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
@ -330,6 +331,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
return guest_sample_index ? 3 : 0;
}
// A contiguous physical memory region written through memexport in a draw.
struct MemExportRange {
  // Start of the range in dwords (memexport base addresses are dword-based).
  uint32_t base_address_dwords;
  // Length of the range in bytes.
  uint32_t size_bytes;
  explicit MemExportRange(uint32_t base_address_dwords, uint32_t size_bytes)
      : base_address_dwords(base_address_dwords), size_bytes(size_bytes) {}
};
// Gathers memory ranges involved in memexports in the shader with the float
// constants from the registers, adding them to ranges_out.
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
std::vector<MemExportRange>& ranges_out);
// To avoid passing values that the shader won't understand (even though
// Direct3D 9 shouldn't pass them anyway).
xenos::CopySampleSelect SanitizeCopySampleSelect(

View File

@ -913,6 +913,8 @@ enum class OperandModifier : uint32_t {
struct Dest : OperandAddress {
// Ignored for 0-component and 1-component operand types.
// For 4-component operand types, if the write mask is 0, it's treated as
// 0-component.
uint32_t write_mask_;
// Input destinations (v*) are for use only in declarations. Vector input
@ -1028,12 +1030,16 @@ struct Dest : OperandAddress {
void Write(std::vector<uint32_t>& code, bool in_dcl = false) const {
uint32_t operand_token = GetOperandTokenTypeAndIndex();
OperandDimension dimension = GetDimension(in_dcl);
operand_token |= uint32_t(dimension);
if (dimension == OperandDimension::kVector) {
assert_true(write_mask_ > 0b0000 && write_mask_ <= 0b1111);
if (write_mask_) {
assert_true(write_mask_ <= 0b1111);
operand_token |=
(uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4);
} else {
dimension = OperandDimension::kNoData;
}
}
operand_token |= uint32_t(dimension);
code.push_back(operand_token);
OperandAddress::Write(code);
}
@ -1507,6 +1513,8 @@ enum class Opcode : uint32_t {
kStoreUAVTyped = 164,
kLdRaw = 165,
kStoreRaw = 166,
kAtomicAnd = 169,
kAtomicOr = 170,
kEvalSampleIndex = 204,
kEvalCentroid = 205,
};
@ -2395,6 +2403,14 @@ class Assembler {
++stat_.instruction_count;
++stat_.c_texture_store_instructions;
}
// Emits an atomic_and instruction: bitwise-ANDs value into the memory at the
// given address of the destination resource.
void OpAtomicAnd(const Dest& dest, const Src& address,
                 uint32_t address_components, const Src& value) {
  EmitAtomicOp(Opcode::kAtomicAnd, dest, address, address_components, value);
}
// Emits an atomic_or instruction: bitwise-ORs value into the memory at the
// given address of the destination resource.
void OpAtomicOr(const Dest& dest, const Src& address,
                uint32_t address_components, const Src& value) {
  EmitAtomicOp(Opcode::kAtomicOr, dest, address, address_components, value);
}
void OpEvalSampleIndex(const Dest& dest, const Src& value,
const Src& sample_index) {
uint32_t dest_write_mask = dest.GetMask();
@ -2521,6 +2537,22 @@ class Assembler {
src1.Write(code_, true, 0b0000);
++stat_.instruction_count;
}
// Shared encoder for atomic read-modify-write instructions (atomic_and,
// atomic_or): writes the opcode token followed by the destination, the
// address (with address_components significant components), and the value
// (of which only the .x component is used) into the code stream, and updates
// the instruction statistics.
void EmitAtomicOp(Opcode opcode, const Dest& dest, const Src& address,
                  uint32_t address_components, const Src& value) {
  // Atomic operations require a 0-component memory destination.
  assert_zero(dest.GetMask());
  // Mask selecting the low address_components components of the address.
  uint32_t address_mask = (1 << address_components) - 1;
  // Total operand length is needed up front for the opcode token.
  uint32_t operands_length = dest.GetLength() +
                             address.GetLength(address_mask) +
                             value.GetLength(0b0001);
  code_.reserve(code_.size() + 1 + operands_length);
  code_.push_back(OpcodeToken(opcode, operands_length));
  dest.Write(code_);
  address.Write(code_, true, address_mask);
  value.Write(code_, true, 0b0001);
  ++stat_.instruction_count;
  ++stat_.c_interlocked_instructions;
}
std::vector<uint32_t>& code_;
Statistics& stat_;

View File

@ -177,8 +177,6 @@ void DxbcShaderTranslator::Reset() {
sampler_bindings_.clear();
memexport_alloc_current_count_ = 0;
std::memset(&shader_feature_info_, 0, sizeof(shader_feature_info_));
std::memset(&statistics_, 0, sizeof(statistics_));
}
@ -787,6 +785,63 @@ void DxbcShaderTranslator::StartPixelShader() {
PopSystemTemp();
}
}
if (current_shader().memexport_eM_written()) {
// Make sure memexport is done only once for a guest pixel.
dxbc::Dest memexport_enabled_dest(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001));
dxbc::Src memexport_enabled_src(dxbc::Src::R(
system_temp_memexport_enabled_and_eM_written_, dxbc::Src::kXXXX));
uint32_t resolution_scaled_axes =
uint32_t(draw_resolution_scale_x_ > 1) |
(uint32_t(draw_resolution_scale_y_ > 1) << 1);
if (resolution_scaled_axes) {
uint32_t memexport_condition_temp = PushSystemTemp();
// Only do memexport for one host pixel in a guest pixel - prefer the
// host pixel closer to the center of the guest pixel, but one that's
// covered with the half-pixel offset according to the top-left rule (1
// for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x
// because it's the center and is covered with the half-pixel offset too).
in_position_used_ |= resolution_scaled_axes;
a_.OpFToU(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
dxbc::Src::V1D(in_reg_ps_position_));
a_.OpUDiv(dxbc::Dest::Null(),
dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
dxbc::Src::R(memexport_condition_temp),
dxbc::Src::LU(draw_resolution_scale_x_,
draw_resolution_scale_y_, 0, 0));
a_.OpIEq(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
dxbc::Src::R(memexport_condition_temp),
dxbc::Src::LU(draw_resolution_scale_x_ >> 1,
draw_resolution_scale_y_ >> 1, 0, 0));
for (uint32_t i = 0; i < 2; ++i) {
if (!(resolution_scaled_axes & (1 << i))) {
continue;
}
a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
dxbc::Src::R(memexport_condition_temp).Select(i));
}
// Release memexport_condition_temp.
PopSystemTemp();
}
// With sample-rate shading (with float24 conversion), only do memexport
// from one sample (as the shader is invoked multiple times for a pixel),
// if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage,
// firstbit_lo returns 0xFFFFFFFF.
if (IsSampleRate()) {
uint32_t memexport_condition_temp = PushSystemTemp();
a_.OpFirstBitLo(dxbc::Dest::R(memexport_condition_temp, 0b0001),
dxbc::Src::VCoverage());
a_.OpIEq(
dxbc::Dest::R(memexport_condition_temp, 0b0001),
dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY),
dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
// Release memexport_condition_temp.
PopSystemTemp();
}
}
}
void DxbcShaderTranslator::StartTranslation() {
@ -883,34 +938,27 @@ void DxbcShaderTranslator::StartTranslation() {
}
}
if (!is_depth_only_pixel_shader_) {
// Allocate temporary registers for memexport addresses and data.
std::memset(system_temps_memexport_address_, 0xFF,
sizeof(system_temps_memexport_address_));
std::memset(system_temps_memexport_data_, 0xFF,
sizeof(system_temps_memexport_data_));
system_temp_memexport_written_ = UINT32_MAX;
const uint8_t* memexports_written = current_shader().memexport_eM_written();
for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
uint32_t memexport_alloc_written = memexports_written[i];
if (memexport_alloc_written == 0) {
continue;
}
// If memexport is used at all, allocate a register containing whether eM#
// have actually been written to.
if (system_temp_memexport_written_ == UINT32_MAX) {
system_temp_memexport_written_ = PushSystemTemp(0b1111);
}
system_temps_memexport_address_[i] = PushSystemTemp(0b1111);
uint32_t memexport_data_index;
while (xe::bit_scan_forward(memexport_alloc_written,
&memexport_data_index)) {
memexport_alloc_written &= ~(1u << memexport_data_index);
system_temps_memexport_data_[i][memexport_data_index] =
PushSystemTemp();
// Allocate temporary registers for memexport.
uint8_t memexport_eM_written = current_shader().memexport_eM_written();
if (memexport_eM_written) {
system_temp_memexport_enabled_and_eM_written_ = PushSystemTemp(0b0010);
// Initialize the memexport conditional to whether the shared memory is
// currently bound as UAV (to 0 or UINT32_MAX). It can be made narrower
// later.
a_.OpIBFE(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001),
dxbc::Src::LU(1), dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift),
LoadFlagsSystemConstant());
system_temp_memexport_address_ = PushSystemTemp(0b1111);
uint8_t memexport_eM_remaining = memexport_eM_written;
uint32_t memexport_eM_index;
while (xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) {
memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index);
system_temps_memexport_data_[memexport_eM_index] = PushSystemTemp(0b1111);
}
}
if (!is_depth_only_pixel_shader_) {
// Allocate system temporary variables for the translated code. Since access
// depends on the guest code (thus no guarantees), initialize everything
// now (except for pv, it's an internal temporary variable, not accessible
@ -1089,27 +1137,19 @@ void DxbcShaderTranslator::CompleteShaderCode() {
// - system_temp_grad_h_lod_.
// - system_temp_grad_v_vfetch_address_.
PopSystemTemp(6);
}
// Write memexported data to the shared memory UAV.
ExportToMemory();
uint8_t memexport_eM_written = current_shader().memexport_eM_written();
if (memexport_eM_written) {
// Write data for the last memexport.
ExportToMemory(
current_shader().memexport_eM_potentially_written_before_end());
// Release memexport temporary registers.
for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) {
if (system_temps_memexport_address_[i] == UINT32_MAX) {
continue;
}
// Release exported data registers.
for (int j = 4; j >= 0; --j) {
if (system_temps_memexport_data_[i][j] != UINT32_MAX) {
PopSystemTemp();
}
}
// Release the address register.
PopSystemTemp();
}
if (system_temp_memexport_written_ != UINT32_MAX) {
PopSystemTemp();
}
// Release memexport temporary registers:
// - system_temp_memexport_enabled_and_eM_written_.
// - system_temp_memexport_address_.
// - system_temps_memexport_data_.
PopSystemTemp(xe::bit_count(uint32_t(memexport_eM_written)) + 2);
}
// Write stage-specific epilogue.
@ -1512,36 +1552,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
dest = dxbc::Dest::R(system_temp_point_size_edge_flag_kill_vertex_);
break;
case InstructionStorageTarget::kExportAddress:
// Validate memexport writes (4D5307E6 has some completely invalid ones).
if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > Shader::kMaxMemExports ||
system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
UINT32_MAX) {
if (!current_shader().memexport_eM_written()) {
return;
}
dest = dxbc::Dest::R(
system_temps_memexport_address_[memexport_alloc_current_count_ - 1]);
dest = dxbc::Dest::R(system_temp_memexport_address_);
break;
case InstructionStorageTarget::kExportData: {
// Validate memexport writes (4D5307E6 has some completely invalid ones).
if (memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > Shader::kMaxMemExports ||
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index] == UINT32_MAX) {
return;
}
dest = dxbc::Dest::R(
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index]);
assert_not_zero(current_shader().memexport_eM_written() &
(uint8_t(1) << result.storage_index));
dest = dxbc::Dest::R(system_temps_memexport_data_[result.storage_index]);
// Mark that the eM# has been written to and needs to be exported.
assert_not_zero(used_write_mask);
uint32_t memexport_index = memexport_alloc_current_count_ - 1;
a_.OpOr(dxbc::Dest::R(system_temp_memexport_written_,
1 << (memexport_index >> 2)),
dxbc::Src::R(system_temp_memexport_written_)
.Select(memexport_index >> 2),
dxbc::Src::LU(uint32_t(1) << (result.storage_index +
((memexport_index & 3) << 3))));
a_.OpOr(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_,
dxbc::Src::kYYYY),
dxbc::Src::LU(uint8_t(1) << result.storage_index));
} break;
case InstructionStorageTarget::kColor:
assert_not_zero(used_write_mask);
@ -1988,15 +2014,38 @@ void DxbcShaderTranslator::ProcessJumpInstruction(
}
void DxbcShaderTranslator::ProcessAllocInstruction(
const ParsedAllocInstruction& instr) {
const ParsedAllocInstruction& instr, uint8_t export_eM) {
bool start_memexport = instr.type == AllocType::kMemory &&
current_shader().memexport_eM_written();
if (export_eM || start_memexport) {
CloseExecConditionals();
}
if (emit_source_map_) {
instruction_disassembly_buffer_.Reset();
instr.Disassemble(&instruction_disassembly_buffer_);
EmitInstructionDisassembly();
}
if (instr.type == AllocType::kMemory) {
++memexport_alloc_current_count_;
if (export_eM) {
ExportToMemory(export_eM);
// Reset which eM# elements have been written.
a_.OpMov(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
dxbc::Src::LU(0));
// Break dependencies from the previous memexport.
uint8_t export_eM_remaining = export_eM;
uint32_t eM_index;
while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) {
export_eM_remaining &= ~(uint8_t(1) << eM_index);
a_.OpMov(dxbc::Dest::R(system_temps_memexport_data_[eM_index]),
dxbc::Src::LF(0.0f));
}
}
if (start_memexport) {
// Initialize eA to an invalid address.
a_.OpMov(dxbc::Dest::R(system_temp_memexport_address_), dxbc::Src::LU(0));
}
}
@ -2849,7 +2898,7 @@ void DxbcShaderTranslator::WriteInputSignature() {
// Sample index (SV_SampleIndex) for safe memexport with sample-rate
// shading.
size_t sample_index_position = SIZE_MAX;
if (current_shader().is_valid_memexport_used() && IsSampleRate()) {
if (current_shader().memexport_eM_written() && IsSampleRate()) {
size_t sample_index_position = shader_object_.size();
shader_object_.resize(shader_object_.size() + kParameterDwords);
++parameter_count;
@ -3623,7 +3672,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
dxbc::Name::kPosition);
}
bool sample_rate_memexport =
current_shader().is_valid_memexport_used() && IsSampleRate();
current_shader().memexport_eM_written() && IsSampleRate();
// Sample-rate shading can't be done with UAV-only rendering (sample-rate
// shading is only needed for float24 depth conversion when using a float32
// host depth buffer).

View File

@ -20,6 +20,7 @@
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/dxbc.h"
#include "xenia/gpu/shader_translator.h"
#include "xenia/gpu/ucode.h"
#include "xenia/ui/graphics_provider.h"
namespace xe {
@ -589,13 +590,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
void ProcessLoopEndInstruction(
const ParsedLoopEndInstruction& instr) override;
void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override;
void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override;
void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
uint8_t export_eM) override;
void ProcessVertexFetchInstruction(
const ParsedVertexFetchInstruction& instr) override;
void ProcessTextureFetchInstruction(
const ParsedTextureFetchInstruction& instr) override;
void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
void ProcessAluInstruction(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) override;
private:
// IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature
@ -674,6 +678,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Frees the last allocated internal r# registers for later reuse.
void PopSystemTemp(uint32_t count = 1);
// ExportToMemory modifies the values of eA/eM# for simplicity, call only
// before starting a new export or ending the invocation or making it
// inactive.
void ExportToMemory(uint8_t export_eM);
// Converts one scalar from piecewise linear gamma to linear. The target may
// be the same as the source, the temporary variables must be different. If
// the source is not pre-saturated, saturation will be done internally.
@ -728,7 +737,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
bool ROV_IsDepthStencilEarly() const {
assert_true(edram_rov_used_);
return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
!current_shader().is_valid_memexport_used();
!current_shader().memexport_eM_written();
}
// Converts the pre-clamped depth value to 24-bit (storing the result in bits
// 0:23 and zeros in 24:31, not creating room for stencil - since this may be
@ -787,14 +796,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
void StartPixelShader_LoadROVParameters();
void StartPixelShader();
// Writing the epilogue.
// ExportToMemory modifies the values of eA/eM# for simplicity, don't call
// multiple times.
void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count,
const uint32_t bits[4],
const dxbc::Src& is_integer,
const dxbc::Src& is_signed);
void ExportToMemory();
void CompleteVertexOrDomainShader();
// For RTV, adds the sample to coverage_temp.coverage_temp_component if it
// passes alpha to mask (or, if initialize == true (for the first sample
@ -917,13 +918,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
.SelectFromSwizzled(word_index & 1);
}
void KillPixel(bool condition, const dxbc::Src& condition_src);
void KillPixel(bool condition, const dxbc::Src& condition_src,
uint8_t memexport_eM_potentially_written_before);
void ProcessVectorAluOperation(const ParsedAluInstruction& instr,
uint32_t& result_swizzle,
bool& predicate_written);
void ProcessScalarAluOperation(const ParsedAluInstruction& instr,
void ProcessVectorAluOperation(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
bool& predicate_written);
void ProcessScalarAluOperation(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, bool& predicate_written);
void WriteResourceDefinition();
void WriteInputSignature();
@ -1124,14 +1128,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
// writing).
uint32_t system_temps_color_[4];
// Bits containing whether each eM# has been written, for up to 16 streams, or
// UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with
// 4 `alloc export`s per component.
uint32_t system_temp_memexport_written_;
// eA in each `alloc export`, or UINT32_MAX if not used.
uint32_t system_temps_memexport_address_[Shader::kMaxMemExports];
// eM# in each `alloc export`, or UINT32_MAX if not used.
uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5];
// Memory export temporary registers are allocated if the shader writes any
// eM# (current_shader().memexport_eM_written() != 0).
// X - whether memexport is enabled for this invocation.
// Y - which eM# elements have been written so far by the invocation since the
// last memory write.
uint32_t system_temp_memexport_enabled_and_eM_written_;
// eA.
uint32_t system_temp_memexport_address_;
// eM#.
uint32_t system_temps_memexport_data_[ucode::kMaxMemExportElementCount];
// Vector ALU or fetch result / scratch (since Xenos write masks can contain
// swizzles).
@ -1195,10 +1201,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t uav_index_edram_;
std::vector<SamplerBinding> sampler_bindings_;
// Number of `alloc export`s encountered so far in the translation. The index
// of the current eA/eM# temp register set is this minus 1, if it's not 0.
uint32_t memexport_alloc_current_count_;
};
} // namespace gpu

View File

@ -19,22 +19,29 @@ namespace xe {
namespace gpu {
using namespace ucode;
void DxbcShaderTranslator::KillPixel(bool condition,
const dxbc::Src& condition_src) {
void DxbcShaderTranslator::KillPixel(
bool condition, const dxbc::Src& condition_src,
uint8_t memexport_eM_potentially_written_before) {
a_.OpIf(condition, condition_src);
// Perform outstanding memory exports before the invocation becomes inactive
// and UAV writes are disabled.
ExportToMemory(memexport_eM_potentially_written_before);
// Discard the pixel, but continue execution if other lanes in the quad need
// this lane for derivatives. The driver may also perform early exiting
// internally if all lanes are discarded if deemed beneficial.
a_.OpDiscard(condition, condition_src);
a_.OpDiscard(true, dxbc::Src::LU(UINT32_MAX));
if (edram_rov_used_) {
// Even though discarding disables all subsequent UAV/ROV writes, also skip
// as much of the Render Backend emulation logic as possible by setting the
// coverage and the mask of the written render targets to zero.
a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0));
}
a_.OpEndIf();
}
void DxbcShaderTranslator::ProcessVectorAluOperation(
const ParsedAluInstruction& instr, uint32_t& result_swizzle,
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
bool& predicate_written) {
result_swizzle = dxbc::Src::kXYZW;
predicate_written = false;
@ -506,7 +513,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -522,7 +530,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -538,7 +547,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -554,7 +564,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -640,7 +651,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
}
void DxbcShaderTranslator::ProcessScalarAluOperation(
const ParsedAluInstruction& instr, bool& predicate_written) {
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, bool& predicate_written) {
predicate_written = false;
if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev) {
@ -950,27 +962,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
case AluScalarOpcode::kKillsEq:
a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsGt:
a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a);
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsGe:
a_.OpGE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsNe:
a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsOne:
a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
@ -1024,7 +1036,8 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
}
void DxbcShaderTranslator::ProcessAluInstruction(
const ParsedAluInstruction& instr) {
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) {
if (instr.IsNop()) {
// Don't even disassemble or update predication.
return;
@ -1041,10 +1054,11 @@ void DxbcShaderTranslator::ProcessAluInstruction(
// checked again later.
bool predicate_written_vector = false;
uint32_t vector_result_swizzle = dxbc::Src::kXYZW;
ProcessVectorAluOperation(instr, vector_result_swizzle,
predicate_written_vector);
ProcessVectorAluOperation(instr, memexport_eM_potentially_written_before,
vector_result_swizzle, predicate_written_vector);
bool predicate_written_scalar = false;
ProcessScalarAluOperation(instr, predicate_written_scalar);
ProcessScalarAluOperation(instr, memexport_eM_potentially_written_before,
predicate_written_scalar);
StoreResult(instr.vector_and_constant_result,
dxbc::Src::R(system_temp_result_, vector_result_swizzle),

File diff suppressed because it is too large Load Diff

View File

@ -673,7 +673,7 @@ class Shader {
// For implementation without unconditional support for memory writes from
// vertex shaders, vertex shader converted to a compute shader doing only
// memory export.
kMemexportCompute,
kMemExportCompute,
// 4 host vertices for 1 guest vertex, for implementations without
// unconditional geometry shader support.
@ -770,9 +770,16 @@ class Shader {
}
};
// Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game
// .pdb.
static constexpr uint32_t kMaxMemExports = 16;
struct ControlFlowMemExportInfo {
// Which eM elements have potentially (regardless of conditionals, loop
// iteration counts, predication) been written earlier in the predecessor
// graph of the instruction since an `alloc export`.
uint8_t eM_potentially_written_before = 0;
// For exec sequences, which eM elements are potentially (regardless of
// predication) written by the instructions in the sequence. For other
// control flow instructions, it's 0.
uint8_t eM_potentially_written_by_exec = 0;
};
class Translation {
public:
@ -880,19 +887,21 @@ class Shader {
return constant_register_map_;
}
// uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have
// been written to after each `alloc export`, for up to Shader::kMaxMemExports
// exports. This will contain zero for certain corrupt exports - for those to
// which a valid eA was not written via a MAD with a stream constant.
const uint8_t* memexport_eM_written() const { return memexport_eM_written_; }
// Information about memory export state at each control flow instruction. May
// be empty if there are no eM# writes.
const std::vector<ControlFlowMemExportInfo>& cf_memexport_info() const {
return cf_memexport_info_;
}
// All c# registers used as the addend in MAD operations to eA.
uint8_t memexport_eM_written() const { return memexport_eM_written_; }
uint8_t memexport_eM_potentially_written_before_end() const {
return memexport_eM_potentially_written_before_end_;
}
// c# registers used as the addend in MAD operations to eA.
const std::set<uint32_t>& memexport_stream_constants() const {
return memexport_stream_constants_;
}
bool is_valid_memexport_used() const {
return !memexport_stream_constants_.empty();
}
// Labels that jumps (explicit or from loops) can be done to.
const std::set<uint32_t>& label_addresses() const { return label_addresses_; }
@ -970,7 +979,7 @@ class Shader {
// TODO(Triang3l): Investigate what happens to memexport when the pixel
// fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
// depth/stencil.
return !kills_pixels() && !writes_depth() && !is_valid_memexport_used();
return !kills_pixels() && !writes_depth() && !memexport_eM_written();
}
// Whether each color render target is written to on any execution path.
@ -1042,8 +1051,6 @@ class Shader {
std::vector<VertexBinding> vertex_bindings_;
std::vector<TextureBinding> texture_bindings_;
ConstantRegisterMap constant_register_map_ = {0};
uint8_t memexport_eM_written_[kMaxMemExports] = {};
std::set<uint32_t> memexport_stream_constants_;
std::set<uint32_t> label_addresses_;
uint32_t cf_pair_index_bound_ = 0;
uint32_t register_static_address_bound_ = 0;
@ -1055,6 +1062,17 @@ class Shader {
bool uses_texture_fetch_instruction_results_ = false;
bool writes_depth_ = false;
// Memory export eM write info for each control flow instruction, if there are
// any eM writes in the shader.
std::vector<ControlFlowMemExportInfo> cf_memexport_info_;
// Which memexport elements (eM#) are written for any memexport in the shader.
uint8_t memexport_eM_written_ = 0;
// ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the
// end of the shader, for the last memory export (or exports if the end has
// multiple predecessor chains exporting to memory).
uint8_t memexport_eM_potentially_written_before_end_ = 0;
std::set<uint32_t> memexport_stream_constants_;
// Modification bits -> translation.
std::unordered_map<uint64_t, Translation*> translations_;
@ -1064,8 +1082,7 @@ class Shader {
void GatherExecInformation(
const ParsedExecInstruction& instr,
ucode::VertexFetchInstruction& previous_vfetch_full,
uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer);
uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer);
void GatherVertexFetchInformation(
const ucode::VertexFetchInstruction& op,
ucode::VertexFetchInstruction& previous_vfetch_full,
@ -1074,13 +1091,12 @@ class Shader {
uint32_t& unique_texture_bindings,
StringBuffer& ucode_disasm_buffer);
void GatherAluInstructionInformation(const ucode::AluInstruction& op,
uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written,
uint32_t exec_cf_index,
StringBuffer& ucode_disasm_buffer);
void GatherOperandInformation(const InstructionOperand& operand);
void GatherFetchResultInformation(const InstructionResult& result);
void GatherAluResultInformation(const InstructionResult& result,
uint32_t memexport_alloc_current_count);
uint32_t exec_cf_index);
};
} // namespace gpu

View File

@ -14,6 +14,7 @@
#include <cstring>
#include <set>
#include <string>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
@ -93,8 +94,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
VertexFetchInstruction previous_vfetch_full;
std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full));
uint32_t unique_texture_bindings = 0;
uint32_t memexport_alloc_count = 0;
uint32_t memexport_eA_written = 0;
for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
ControlFlowInstruction cf_ab[2];
UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab);
@ -117,8 +116,7 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
ParsedExecInstruction instr;
ParseControlFlowExec(cf.exec, cf_index, instr);
GatherExecInformation(instr, previous_vfetch_full,
unique_texture_bindings, memexport_alloc_count,
memexport_eA_written, ucode_disasm_buffer);
unique_texture_bindings, ucode_disasm_buffer);
} break;
case ControlFlowOpcode::kCondExec:
case ControlFlowOpcode::kCondExecEnd:
@ -128,16 +126,14 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
ParsedExecInstruction instr;
ParseControlFlowCondExec(cf.cond_exec, cf_index, instr);
GatherExecInformation(instr, previous_vfetch_full,
unique_texture_bindings, memexport_alloc_count,
memexport_eA_written, ucode_disasm_buffer);
unique_texture_bindings, ucode_disasm_buffer);
} break;
case ControlFlowOpcode::kCondExecPred:
case ControlFlowOpcode::kCondExecPredEnd: {
ParsedExecInstruction instr;
ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr);
GatherExecInformation(instr, previous_vfetch_full,
unique_texture_bindings, memexport_alloc_count,
memexport_eA_written, ucode_disasm_buffer);
unique_texture_bindings, ucode_disasm_buffer);
} break;
case ControlFlowOpcode::kLoopStart: {
ParsedLoopStartInstruction instr;
@ -179,9 +175,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
ParseControlFlowAlloc(cf.alloc, cf_index,
type() == xenos::ShaderType::kVertex, instr);
instr.Disassemble(&ucode_disasm_buffer);
if (instr.type == AllocType::kMemory) {
++memexport_alloc_count;
}
} break;
case ControlFlowOpcode::kMarkVsFetchDone:
break;
@ -212,16 +205,124 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
}
}
// Cleanup invalid/unneeded memexport allocs.
for (uint32_t i = 0; i < kMaxMemExports; ++i) {
if (!(memexport_eA_written & (uint32_t(1) << i))) {
memexport_eM_written_[i] = 0;
} else if (!memexport_eM_written_[i]) {
memexport_eA_written &= ~(uint32_t(1) << i);
if (!cf_memexport_info_.empty()) {
// Gather potentially "dirty" memexport elements before each control flow
// instruction. `alloc` (any, not only `export`) flushes the previous memory
// export. On the guest GPU, yielding / serializing also terminates memory
// exports, but that's disregarded here for simplicity, as it functionally
// changes nothing compared to flushing the previous memory export only at
// `alloc` (or even only specifically at `alloc export`); Microsoft's
// validator checks that eM# aren't written after a `serialize`.
std::vector<uint32_t> successor_stack;
for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
ControlFlowInstruction eM_writing_cf_ab[2];
UnpackControlFlowInstructions(ucode_data_.data() + i * 3,
eM_writing_cf_ab);
for (uint32_t j = 0; j < 2; ++j) {
uint32_t eM_writing_cf_index = i * 2 + j;
uint32_t eM_written_by_cf_instr =
cf_memexport_info_[eM_writing_cf_index]
.eM_potentially_written_by_exec;
if (eM_writing_cf_ab[j].opcode() == ControlFlowOpcode::kCondCall) {
// Until subroutine calls are handled accurately, assume that all eM#
// have potentially been written by the subroutine for simplicity.
eM_written_by_cf_instr = memexport_eM_written_;
}
if (!eM_written_by_cf_instr) {
continue;
}
// If the control flow instruction potentially results in any eM# being
// written, mark those eM# as potentially written before each successor.
bool is_successor_graph_head = true;
successor_stack.push_back(eM_writing_cf_index);
while (!successor_stack.empty()) {
uint32_t successor_cf_index = successor_stack.back();
successor_stack.pop_back();
ControlFlowMemExportInfo& successor_memexport_info =
cf_memexport_info_[successor_cf_index];
if ((successor_memexport_info.eM_potentially_written_before &
eM_written_by_cf_instr) == eM_written_by_cf_instr) {
// Already marked as written before this instruction (and thus
// before all its successors too). Possibly this instruction is in a
// loop; in that case an instruction may succeed itself.
break;
}
// The first instruction in the traversal is the writing instruction
// itself, not its successor. However, if it has been visited by the
// traversal twice, it's in a loop, so it succeeds itself, and thus
// writes from it are potentially done before it too.
if (!is_successor_graph_head) {
successor_memexport_info.eM_potentially_written_before |=
eM_written_by_cf_instr;
}
is_successor_graph_head = false;
ControlFlowInstruction successor_cf_ab[2];
UnpackControlFlowInstructions(
ucode_data_.data() + (successor_cf_index >> 1) * 3,
successor_cf_ab);
const ControlFlowInstruction& successor_cf =
successor_cf_ab[successor_cf_index & 1];
bool next_instr_is_new_successor = true;
switch (successor_cf.opcode()) {
case ControlFlowOpcode::kExecEnd:
// One successor: end.
memexport_eM_potentially_written_before_end_ |=
eM_written_by_cf_instr;
next_instr_is_new_successor = false;
break;
case ControlFlowOpcode::kCondExecEnd:
case ControlFlowOpcode::kCondExecPredEnd:
case ControlFlowOpcode::kCondExecPredCleanEnd:
// Two successors: next, end.
memexport_eM_potentially_written_before_end_ |=
eM_written_by_cf_instr;
break;
case ControlFlowOpcode::kLoopStart:
// Two successors: next, skip.
successor_stack.push_back(successor_cf.loop_start.address());
break;
case ControlFlowOpcode::kLoopEnd:
// Two successors: next, repeat.
successor_stack.push_back(successor_cf.loop_end.address());
break;
case ControlFlowOpcode::kCondCall:
// Two successors: next, target.
successor_stack.push_back(successor_cf.cond_call.address());
break;
case ControlFlowOpcode::kReturn:
// Currently treating all subroutine calls as potentially writing
// all eM# for simplicity, so just exit the subroutine.
next_instr_is_new_successor = false;
break;
case ControlFlowOpcode::kCondJmp:
// One or two successors: next if conditional, target.
successor_stack.push_back(successor_cf.cond_jmp.address());
if (successor_cf.cond_jmp.is_unconditional()) {
next_instr_is_new_successor = false;
}
break;
case ControlFlowOpcode::kAlloc:
// Any `alloc` ends the previous export.
next_instr_is_new_successor = false;
break;
default:
break;
}
if (next_instr_is_new_successor) {
if (successor_cf_index < (cf_pair_index_bound_ << 1)) {
successor_stack.push_back(successor_cf_index + 1);
} else {
memexport_eM_potentially_written_before_end_ |=
eM_written_by_cf_instr;
}
}
}
}
}
if (memexport_eA_written == 0) {
memexport_stream_constants_.clear();
}
is_ucode_analyzed_ = true;
@ -256,8 +357,7 @@ uint32_t Shader::GetInterpolatorInputMask(reg::SQ_PROGRAM_CNTL sq_program_cntl,
void Shader::GatherExecInformation(
const ParsedExecInstruction& instr,
ucode::VertexFetchInstruction& previous_vfetch_full,
uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer) {
instr.Disassemble(&ucode_disasm_buffer);
uint32_t sequence = instr.sequence;
for (uint32_t instr_offset = instr.instruction_address;
@ -279,8 +379,7 @@ void Shader::GatherExecInformation(
}
} else {
auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
GatherAluInstructionInformation(op, memexport_alloc_current_count,
memexport_eA_written,
GatherAluInstructionInformation(op, instr.dword_index,
ucode_disasm_buffer);
}
}
@ -388,8 +487,8 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
}
void Shader::GatherAluInstructionInformation(
const AluInstruction& op, uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
const AluInstruction& op, uint32_t exec_cf_index,
StringBuffer& ucode_disasm_buffer) {
ParsedAluInstruction instr;
ParseAluInstruction(op, type(), instr);
instr.Disassemble(&ucode_disasm_buffer);
@ -401,10 +500,8 @@ void Shader::GatherAluInstructionInformation(
(ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state &
ucode::kAluOpChangedStatePixelKill);
GatherAluResultInformation(instr.vector_and_constant_result,
memexport_alloc_current_count);
GatherAluResultInformation(instr.scalar_result,
memexport_alloc_current_count);
GatherAluResultInformation(instr.vector_and_constant_result, exec_cf_index);
GatherAluResultInformation(instr.scalar_result, exec_cf_index);
for (size_t i = 0; i < instr.vector_operand_count; ++i) {
GatherOperandInformation(instr.vector_operands[i]);
}
@ -412,9 +509,7 @@ void Shader::GatherAluInstructionInformation(
GatherOperandInformation(instr.scalar_operands[i]);
}
// Store used memexport constants because CPU code needs addresses and sizes,
// and also whether there have been writes to eA and eM# for register
// allocation in shader translator implementations.
// Store used memexport constants because CPU code needs addresses and sizes.
// eA is (hopefully) always written to using:
// mad eA, r#, const0100, c#
// (though there are some exceptions, shaders in 4D5307E6 for some reason set
@ -423,13 +518,9 @@ void Shader::GatherAluInstructionInformation(
// Export is done to vector_dest of the ucode instruction for both vector and
// scalar operations - no need to check separately.
if (instr.vector_and_constant_result.storage_target ==
InstructionStorageTarget::kExportAddress &&
memexport_alloc_current_count > 0 &&
memexport_alloc_current_count <= Shader::kMaxMemExports) {
InstructionStorageTarget::kExportAddress) {
uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant();
if (memexport_stream_constant != UINT32_MAX) {
memexport_eA_written |= uint32_t(1)
<< (memexport_alloc_current_count - 1);
memexport_stream_constants_.insert(memexport_stream_constant);
} else {
XELOGE(
@ -488,8 +579,8 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) {
}
}
void Shader::GatherAluResultInformation(
const InstructionResult& result, uint32_t memexport_alloc_current_count) {
void Shader::GatherAluResultInformation(const InstructionResult& result,
uint32_t exec_cf_index) {
uint32_t used_write_mask = result.GetUsedWriteMask();
if (!used_write_mask) {
return;
@ -511,11 +602,12 @@ void Shader::GatherAluResultInformation(
writes_point_size_edge_flag_kill_vertex_ |= used_write_mask;
break;
case InstructionStorageTarget::kExportData:
if (memexport_alloc_current_count > 0 &&
memexport_alloc_current_count <= Shader::kMaxMemExports) {
memexport_eM_written_[memexport_alloc_current_count - 1] |=
uint32_t(1) << result.storage_index;
memexport_eM_written_ |= uint8_t(1) << result.storage_index;
if (cf_memexport_info_.empty()) {
cf_memexport_info_.resize(2 * cf_pair_index_bound_);
}
cf_memexport_info_[exec_cf_index].eM_potentially_written_by_exec |=
uint32_t(1) << result.storage_index;
break;
case InstructionStorageTarget::kColor:
writes_color_targets_ |= uint32_t(1) << result.storage_index;
@ -672,7 +764,13 @@ void ShaderTranslator::TranslateControlFlowInstruction(
case ControlFlowOpcode::kAlloc: {
ParsedAllocInstruction instr;
ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr);
ProcessAllocInstruction(instr);
const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
current_shader().cf_memexport_info();
ProcessAllocInstruction(instr,
instr.dword_index < cf_memexport_info.size()
? cf_memexport_info[instr.dword_index]
.eM_potentially_written_before
: 0);
} break;
case ControlFlowOpcode::kMarkVsFetchDone:
break;
@ -814,6 +912,14 @@ void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf,
void ShaderTranslator::TranslateExecInstructions(
const ParsedExecInstruction& instr) {
ProcessExecInstructionBegin(instr);
const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
current_shader().cf_memexport_info();
uint8_t eM_potentially_written_before =
instr.dword_index < cf_memexport_info.size()
? cf_memexport_info[instr.dword_index].eM_potentially_written_before
: 0;
const uint32_t* ucode_dwords = current_shader().ucode_data().data();
uint32_t sequence = instr.sequence;
for (uint32_t instr_offset = instr.instruction_address;
@ -839,9 +945,22 @@ void ShaderTranslator::TranslateExecInstructions(
auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
ParsedAluInstruction alu_instr;
ParseAluInstruction(op, current_shader().type(), alu_instr);
ProcessAluInstruction(alu_instr);
ProcessAluInstruction(alu_instr, eM_potentially_written_before);
if (alu_instr.vector_and_constant_result.storage_target ==
InstructionStorageTarget::kExportData &&
alu_instr.vector_and_constant_result.GetUsedWriteMask()) {
eM_potentially_written_before |=
uint8_t(1) << alu_instr.vector_and_constant_result.storage_index;
}
if (alu_instr.scalar_result.storage_target ==
InstructionStorageTarget::kExportData &&
alu_instr.scalar_result.GetUsedWriteMask()) {
eM_potentially_written_before |=
uint8_t(1) << alu_instr.scalar_result.storage_index;
}
}
}
ProcessExecInstructionEnd(instr);
}

View File

@ -127,8 +127,10 @@ class ShaderTranslator {
virtual void ProcessReturnInstruction(const ParsedReturnInstruction& instr) {}
// Handles translation for jump instructions.
virtual void ProcessJumpInstruction(const ParsedJumpInstruction& instr) {}
// Handles translation for alloc instructions.
virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr) {}
// Handles translation for alloc instructions. Memory exports for eM#
// indicated by export_eM must be performed, regardless of the alloc type.
virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
uint8_t export_eM) {}
// Handles translation for vertex fetch instructions.
virtual void ProcessVertexFetchInstruction(
@ -137,7 +139,13 @@ class ShaderTranslator {
virtual void ProcessTextureFetchInstruction(
const ParsedTextureFetchInstruction& instr) {}
// Handles translation for ALU instructions.
virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {}
// memexport_eM_potentially_written_before needs to be handled by `kill`
// instruction to make sure memory exports for the eM# writes earlier in
// previous execs and the current exec are done before the invocation becomes
// inactive.
virtual void ProcessAluInstruction(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) {}
private:
void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf);

View File

@ -134,7 +134,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
// (32-bit only - 16-bit indices are always fetched via the Vulkan index
// buffer).
kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift,
// For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip,
// For HostVertexShaderTypes kMemExportCompute, kPointListAsTriangleStrip,
// kRectangleListAsTriangleStrip, whether the vertex index needs to be
// loaded from the index buffer (rather than using autogenerated indices),
// and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad
@ -427,7 +427,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
const ParsedVertexFetchInstruction& instr) override;
void ProcessTextureFetchInstruction(
const ParsedTextureFetchInstruction& instr) override;
void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
void ProcessAluInstruction(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) override;
private:
struct TextureBinding {
@ -620,7 +622,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
assert_true(edram_fragment_shader_interlock_);
return !is_depth_only_fragment_shader_ &&
!current_shader().writes_depth() &&
!current_shader().is_valid_memexport_used();
!current_shader().memexport_eM_written();
}
void FSI_LoadSampleMask(spv::Id msaa_samples);
void FSI_LoadEdramOffsets(spv::Id msaa_samples);

View File

@ -67,7 +67,8 @@ void SpirvShaderTranslator::KillPixel(spv::Id condition) {
}
void SpirvShaderTranslator::ProcessAluInstruction(
const ParsedAluInstruction& instr) {
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) {
if (instr.IsNop()) {
// Don't even disassemble or update predication.
return;

View File

@ -215,7 +215,7 @@ enum class AllocType : uint32_t {
kVsInterpolators = 2,
// Pixel shader exports colors.
kPsColors = 2,
// MEMEXPORT?
// Memory export.
kMemory = 3,
};
@ -1787,6 +1787,9 @@ inline uint32_t GetAluVectorOpNeededSourceComponents(
.operand_components_used[src_index - 1];
}
// eM# (kExportData) register count.
constexpr uint32_t kMaxMemExportElementCount = 5;
enum class ExportRegister : uint32_t {
kVSInterpolator0 = 0,
kVSInterpolator1,

View File

@ -2175,7 +2175,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
return false;
}
pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
bool memexport_used_vertex = vertex_shader->memexport_eM_written();
// Pixel shader analysis.
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);

View File

@ -456,6 +456,18 @@ enum class TextureFormat : uint32_t {
k_6_5_5 = 5,
k_8_8_8_8 = 6,
k_2_10_10_10 = 7,
// Possibly similar to k_8, but may be storing alpha instead of red when
// resolving/memexporting, though not exactly known. From the point of view of
// sampling, it should be treated the same as k_8 (given that textures have
// the last - and single-component textures have the only - component
// replicated into all the remaining ones before the swizzle).
// Used as:
// - Texture in 4B4E083C - text, starting from the "Loading..." and the "This
// game saves data automatically" messages. The swizzle in the fetch
// constant is 111W (suggesting that internally the only component may be
// the alpha one, not red).
// TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and
// memexports, whether they store alpha/blue of the input or red.
k_8_A = 8,
k_8_B = 9,
k_8_8 = 10,
@ -469,6 +481,12 @@ enum class TextureFormat : uint32_t {
// Used for videos in 54540829.
k_Y1_Cr_Y0_Cb_REP = 12,
k_16_16_EDRAM = 13,
// Likely same as k_8_8_8_8.
// Used as:
// - Memexport destination in 4D5308BC - multiple small draws when looking
// back at the door behind the player in the first room of gameplay.
// - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame
// between the intro video and the main menu, in a 8192-point draw.
k_8_8_8_8_A = 14,
k_4_4_4_4 = 15,
k_10_11_11 = 16,
@ -1326,8 +1344,7 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also
// interesting to see how alphatest interacts with it, whether it's still true
// fixed-function alphatest, as it's claimed to be supported as usual by the
// extension specification - it's likely, however, that memory exports are
// discarded alongside other exports such as oC# and oDepth this way.
// extension specification.
//
// Y of eA contains the offset in elements - this is what shaders are supposed
// to calculate from something like the vertex index. Again, it's specified as
@ -1350,6 +1367,69 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// elements using packing via addition to 2^23, so this field also doesn't need
// more bits than that.
//
// According to the sequencer specification from IPR2015-00325 (where memexport
// is called "pass thru export"):
// - Pass thru exports can occur anywhere in the shader program.
// - There can be any number of pass thru exports.
// - The address register is not kept across clause boundaries, so it must be
// refreshed after any Serialize (or yield), allocate instruction or resource
// change.
// - The write to eM# may be predicated if the export is not needed.
// - Exports are dropped if:
// - The index is above the maximum.
// - The index sign bit is 1.
// - The exponent of the index is not 23.
// The requirement that eM4 must be written if any eM# other than eM0 is also
// written doesn't apply to the final Xenos, it's likely an outdated note in the
// specification considering that it's very preliminary.
//
// According to Microsoft's shader validator:
// - eA can be written only by `mad`.
// - A single eM# can be written by any number of instructions, including with
//   write masking.
// - eA must be written before eM#.
// - Any alloc instruction or a `serialize` terminates the current memory
// export. This doesn't apply to `exec Yield=true`, however, and it's not
// clear if that's an oversight or if that's not considered a yield that
// terminates the export.
//
// From the emulation perspective, this means that:
// - Alloc instructions (`alloc export` mandatorily, other allocs optionally),
// and optionally `serialize` instructions within `exec`, should be treated as
// the locations where the currently open export should be flushed to the
// memory. It should be taken into account that an export may be in looping
// control flow, and in this case it must be performed at every iteration.
// - Whether each eM# was written to must be tracked at shader execution time,
// as predication can disable the export of an element.
//
// TODO(Triang3l): Investigate how memory export interacts with pixel killing.
// Given that eM# writes disabled by predication don't cause an export, it's
// possible that killed invocations are treated as inactive (invalid in Xenos
// terms) overall, and thus new memory exports from them shouldn't be done, but
// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan
// hosts, discarding disables subsequent storage resource writes, on the host,
// it would be natural to perform all outstanding memory exports before
// discarding if the kill condition passes.
//
// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp
// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for
// instance, due to the minimum resource view size limitation on Direct3D 11).
// In this case, bytes and shorts aren't addressable directly. However, taking
// into account that memory accesses are coherent within one shader invocation
// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relative to
// each other, it should be possible to implement them by clearing the bits via
// an atomic AND, and writing the new value using an atomic OR. This will, of
// course, make the entire write operation non-atomic, and in case of a race
// between writes to the same location, the final result may not even be just a
// value from one of the invocations, but rather, it can be OR of the values
// from any invocations involved. However, on the Xenos, there doesn't seem to
// be any possibility of meaningfully accessing the same location from multiple
// invocations if any of them is writing, as memory exports are out-of-order, so
// such an implementation shouldn't be causing issues in reality. Atomic
// compare-exchange, however, should not be used for this purpose, as it may
// result in an infinite loop if different invocations want to write different
// values to the same memory location.
//
// Examples of setup in titles (Z from MSB to LSB):
//
// 4D5307E6 particles (different VS invocation counts, like 1, 2, 4):
@ -1385,6 +1465,11 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// c0: Z = 010010110000|0|010|11|011010|00011|001
// 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch
// (16_16_16_16 is the largest color format without special values)
//
// 58410B86 hierarchical depth buffer occlusion culling with the result read on
// the CPU (15000 VS invocations in the main menu):
// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count
// No endian swap, 8, uint, RGBA
union alignas(uint32_t) xe_gpu_memexport_stream_t {
struct {
uint32_t dword_0;