From 53f98d1fe6e6d2b52b9a1f741f83ec5f6856e146 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 5 May 2023 21:05:23 +0300 Subject: [PATCH 01/14] [GPU/D3D12] Memexport from anywhere in control flow + 8/16bpp memexport There's no limit on the number of memory exports in a shader on the real Xenos, and exports can be done anywhere, including in loops. Now, instead of deferring the exports to the end of the shader, and assuming that export allocs are executed only once, Xenia flushes exports when it reaches an alloc (allocs terminate memory exports on Xenos, as well as individual ALU instructions with `serialize`, but not handling this case for simplicity, it's only truly mandatory to flush memory exports before starting a new one), the end of the shader, or a pixel with outstanding exports is killed. To know which eM# registers need to be flushed to memory, the translator traverses the successors of each exec that potentially writes any eM#, and records that certain eM# registers might have been written before each control flow instruction it reaches, until a flush point or the end of the shader is reached. Also, some games export to sub-32bpp formats. These are now supported via atomic AND clearing the bits of the dword to replace followed by an atomic OR inserting the new byte/short. 
--- .../gpu/d3d12/d3d12_command_processor.cc | 156 +- src/xenia/gpu/d3d12/d3d12_command_processor.h | 11 +- src/xenia/gpu/draw_util.cc | 63 +- src/xenia/gpu/draw_util.h | 14 + src/xenia/gpu/dxbc.h | 40 +- src/xenia/gpu/dxbc_shader_translator.cc | 201 ++- src/xenia/gpu/dxbc_shader_translator.h | 60 +- src/xenia/gpu/dxbc_shader_translator_alu.cc | 50 +- .../gpu/dxbc_shader_translator_memexport.cc | 1301 ++++++++++------- src/xenia/gpu/shader.h | 58 +- src/xenia/gpu/shader_translator.cc | 211 ++- src/xenia/gpu/shader_translator.h | 14 +- src/xenia/gpu/spirv_shader_translator.h | 8 +- src/xenia/gpu/spirv_shader_translator_alu.cc | 3 +- src/xenia/gpu/ucode.h | 5 +- .../gpu/vulkan/vulkan_command_processor.cc | 2 +- src/xenia/gpu/xenos.h | 89 +- 17 files changed, 1437 insertions(+), 849 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index b086f325b..90427f5f7 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2125,7 +2125,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + bool memexport_used_vertex = vertex_shader->memexport_eM_written(); // Pixel shader analysis. 
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); @@ -2154,7 +2154,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } } bool memexport_used_pixel = - pixel_shader && pixel_shader->is_valid_memexport_used(); + pixel_shader && pixel_shader->memexport_eM_written(); bool memexport_used = memexport_used_vertex || memexport_used_pixel; if (!BeginSubmission(true)) { @@ -2341,100 +2341,20 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Gather memexport ranges and ensure the heaps for them are resident, and // also load the data surrounding the export and to fill the regions that // won't be modified by the shaders. - struct MemExportRange { - uint32_t base_address_dwords; - uint32_t size_dwords; - }; - MemExportRange memexport_ranges[512]; - uint32_t memexport_range_count = 0; + memexport_ranges_.clear(); if (memexport_used_vertex) { - for (uint32_t constant_index : - vertex_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; - } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::Get( - xenos::TextureFormat(uint32_t(memexport_stream.format))) - ->name); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - // Try to reduce the number of shared memory operations when writing - // different elements into the same buffer through different exports - // (happens in 4D5307E6). 
- bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - MemExportRange& memexport_range = memexport_ranges[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - // Add a new range if haven't expanded an existing one. - if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges[memexport_range_count++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = memexport_size_dwords; - } - } + draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_); } if (memexport_used_pixel) { - for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; - } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::Get( - xenos::TextureFormat(uint32_t(memexport_stream.format))) - ->name); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - MemExportRange& memexport_range = memexport_ranges[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges[memexport_range_count++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = 
memexport_size_dwords; - } - } + draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_); } - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2)) { + memexport_range.size_bytes)) { XELOGE( "Failed to request memexport stream at 0x{:08X} (size {}) in the " "shared memory", - memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2); + memexport_range.base_address_dwords << 2, memexport_range.size_bytes); return false; } } @@ -2594,17 +2514,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // when memexports should be awaited? shared_memory_->MarkUAVWritesCommitNeeded(); // Invalidate textures in memexported memory and watch for changes. - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { shared_memory_->RangeWrittenByGpu( - memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2, false); + memexport_range.base_address_dwords << 2, memexport_range.size_bytes, + false); } if (cvars::d3d12_readback_memexport) { // Read the exported data on the CPU. 
uint32_t memexport_total_size = 0; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - memexport_total_size += memexport_ranges[i].size_dwords << 2; + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + memexport_total_size += memexport_range.size_bytes; } if (memexport_total_size != 0) { ID3D12Resource* readback_buffer = @@ -2614,9 +2534,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, SubmitBarriers(); ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); uint32_t readback_buffer_offset = 0; - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; - uint32_t memexport_range_size = memexport_range.size_dwords << 2; + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + uint32_t memexport_range_size = memexport_range.size_bytes; deferred_command_list_.D3DCopyBufferRegion( readback_buffer, readback_buffer_offset, shared_memory_buffer, memexport_range.base_address_dwords << 2, memexport_range_size); @@ -2629,14 +2549,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, void* readback_mapping; if (SUCCEEDED(readback_buffer->Map(0, &readback_range, &readback_mapping))) { - const uint32_t* readback_dwords = - reinterpret_cast(readback_mapping); - for (uint32_t i = 0; i < memexport_range_count; ++i) { - const MemExportRange& memexport_range = memexport_ranges[i]; + const uint8_t* readback_bytes = + reinterpret_cast(readback_mapping); + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { std::memcpy(memory_->TranslatePhysical( memexport_range.base_address_dwords << 2), - readback_dwords, memexport_range.size_dwords << 2); - readback_dwords += memexport_range.size_dwords; + readback_bytes, memexport_range.size_bytes); + readback_bytes += memexport_range.size_bytes; } D3D12_RANGE readback_write_range = {}; readback_buffer->Unmap(0, 
&readback_write_range); @@ -4510,36 +4430,6 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader, return true; } -uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize( - xenos::ColorFormat format) { - switch (format) { - case xenos::ColorFormat::k_8_8_8_8: - case xenos::ColorFormat::k_2_10_10_10: - // TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the - // texture cache currently. - // case xenos::ColorFormat::k_8_8_8_8_A: - case xenos::ColorFormat::k_10_11_11: - case xenos::ColorFormat::k_11_11_10: - case xenos::ColorFormat::k_16_16: - case xenos::ColorFormat::k_16_16_FLOAT: - case xenos::ColorFormat::k_32_FLOAT: - case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16: - case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16: - case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16: - case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16: - return 1; - case xenos::ColorFormat::k_16_16_16_16: - case xenos::ColorFormat::k_16_16_16_16_FLOAT: - case xenos::ColorFormat::k_32_32_FLOAT: - return 2; - case xenos::ColorFormat::k_32_32_32_32_FLOAT: - return 4; - default: - break; - } - return 0; -} - ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 353e6bd0b..0574d33bc 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/gpu/command_processor.h" @@ -378,13 +379,6 @@ class D3D12CommandProcessor : public CommandProcessor { ID3D12RootSignature* root_signature, bool shared_memory_is_uav); - // Returns dword count for one element for a memexport format, or 0 if it's - // not supported by the D3D12 command processor (if it's smaller that 1 dword, - // for instance). 
- // TODO(Triang3l): Check if any game uses memexport with formats smaller than - // 32 bits per element. - static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format); - // Returns a buffer for reading GPU data back to the CPU. Assuming // synchronizing immediately after use. Always in COPY_DEST state. ID3D12Resource* RequestReadbackBuffer(uint32_t size); @@ -684,6 +678,9 @@ class D3D12CommandProcessor : public CommandProcessor { // Current primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology_; + + // Temporary storage for memexport stream constants used in the draw. + std::vector memexport_ranges_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index c51cc61a0..eb61c39cb 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2022 Ben Vanik. All rights reserved. * + * Copyright 2023 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -141,7 +141,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, // // Memory export is an obvious intentional side effect. if (shader.kills_pixels() || shader.writes_depth() || - shader.is_valid_memexport_used() || + shader.memexport_eM_written() || (shader.writes_color_target(0) && DoesCoverageDependOnAlpha(regs.Get()))) { return true; @@ -651,6 +651,65 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, return normalized_color_mask; } +void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, + std::vector& ranges_out) { + if (!shader.memexport_eM_written()) { + // The shader has eA writes, but no real exports. 
+ return; + } + uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex + ? regs.Get().base + : regs.Get().base; + for (uint32_t constant_index : shader.memexport_stream_constants()) { + const auto& stream = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_000_X + + (float_constants_base + constant_index) * 4); + if (!stream.index_count) { + continue; + } + const FormatInfo& format_info = + *FormatInfo::Get(xenos::TextureFormat(stream.format)); + if (format_info.type != FormatType::kResolvable) { + XELOGE("Unsupported memexport format {}", format_info.name); + // Translated shaders shouldn't be performing exports with an unknown + // format, the draw can still be performed. + continue; + } + // TODO(Triang3l): Remove the unresearched format logging when it's known + // how exactly these formats need to be handled (most importantly what + // components need to be stored and in which order). + switch (stream.format) { + case xenos::ColorFormat::k_8_A: + case xenos::ColorFormat::k_8_B: + case xenos::ColorFormat::k_8_8_8_8_A: + XELOGW( + "Memexport done to an unresearched format {}, report the game to " + "Xenia developers!", + format_info.name); + break; + default: + break; + } + uint32_t stream_size_bytes = + stream.index_count * (format_info.bits_per_pixel >> 3); + // Try to reduce the number of shared memory operations when writing + // different elements into the same buffer through different exports + // (happens in 4D5307E6). + bool range_reused = false; + for (MemExportRange& range : ranges_out) { + if (range.base_address_dwords == stream.base_address) { + range.size_bytes = std::max(range.size_bytes, stream_size_bytes); + range_reused = true; + break; + } + } + // Add a new range if haven't expanded an existing one. 
+ if (!range_reused) { + ranges_out.emplace_back(stream.base_address, stream_size_bytes); + } + } +} + xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index a365b5436..7335db27b 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/gpu/register_file.h" @@ -330,6 +331,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA( return guest_sample_index ? 3 : 0; } +struct MemExportRange { + uint32_t base_address_dwords; + uint32_t size_bytes; + + explicit MemExportRange(uint32_t base_address_dwords, uint32_t size_bytes) + : base_address_dwords(base_address_dwords), size_bytes(size_bytes) {} +}; + +// Gathers memory ranges involved in memexports in the shader with the float +// constants from the registers, adding them to ranges_out. +void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, + std::vector& ranges_out); + // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway). xenos::CopySampleSelect SanitizeCopySampleSelect( diff --git a/src/xenia/gpu/dxbc.h b/src/xenia/gpu/dxbc.h index 42d8d89d8..57b8511c6 100644 --- a/src/xenia/gpu/dxbc.h +++ b/src/xenia/gpu/dxbc.h @@ -913,6 +913,8 @@ enum class OperandModifier : uint32_t { struct Dest : OperandAddress { // Ignored for 0-component and 1-component operand types. + // For 4-component operand types, if the write mask is 0, it's treated as + // 0-component. uint32_t write_mask_; // Input destinations (v*) are for use only in declarations. 
Vector input @@ -1028,12 +1030,16 @@ struct Dest : OperandAddress { void Write(std::vector& code, bool in_dcl = false) const { uint32_t operand_token = GetOperandTokenTypeAndIndex(); OperandDimension dimension = GetDimension(in_dcl); - operand_token |= uint32_t(dimension); if (dimension == OperandDimension::kVector) { - assert_true(write_mask_ > 0b0000 && write_mask_ <= 0b1111); - operand_token |= - (uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4); + if (write_mask_) { + assert_true(write_mask_ <= 0b1111); + operand_token |= + (uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4); + } else { + dimension = OperandDimension::kNoData; + } } + operand_token |= uint32_t(dimension); code.push_back(operand_token); OperandAddress::Write(code); } @@ -1507,6 +1513,8 @@ enum class Opcode : uint32_t { kStoreUAVTyped = 164, kLdRaw = 165, kStoreRaw = 166, + kAtomicAnd = 169, + kAtomicOr = 170, kEvalSampleIndex = 204, kEvalCentroid = 205, }; @@ -2395,6 +2403,14 @@ class Assembler { ++stat_.instruction_count; ++stat_.c_texture_store_instructions; } + void OpAtomicAnd(const Dest& dest, const Src& address, + uint32_t address_components, const Src& value) { + EmitAtomicOp(Opcode::kAtomicAnd, dest, address, address_components, value); + } + void OpAtomicOr(const Dest& dest, const Src& address, + uint32_t address_components, const Src& value) { + EmitAtomicOp(Opcode::kAtomicOr, dest, address, address_components, value); + } void OpEvalSampleIndex(const Dest& dest, const Src& value, const Src& sample_index) { uint32_t dest_write_mask = dest.GetMask(); @@ -2521,6 +2537,22 @@ class Assembler { src1.Write(code_, true, 0b0000); ++stat_.instruction_count; } + void EmitAtomicOp(Opcode opcode, const Dest& dest, const Src& address, + uint32_t address_components, const Src& value) { + // Atomic operations require a 0-component memory destination. 
+ assert_zero(dest.GetMask()); + uint32_t address_mask = (1 << address_components) - 1; + uint32_t operands_length = dest.GetLength() + + address.GetLength(address_mask) + + value.GetLength(0b0001); + code_.reserve(code_.size() + 1 + operands_length); + code_.push_back(OpcodeToken(opcode, operands_length)); + dest.Write(code_); + address.Write(code_, true, address_mask); + value.Write(code_, true, 0b0001); + ++stat_.instruction_count; + ++stat_.c_interlocked_instructions; + } std::vector& code_; Statistics& stat_; diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 5bf004a96..12a0d02b0 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -177,8 +177,6 @@ void DxbcShaderTranslator::Reset() { sampler_bindings_.clear(); - memexport_alloc_current_count_ = 0; - std::memset(&shader_feature_info_, 0, sizeof(shader_feature_info_)); std::memset(&statistics_, 0, sizeof(statistics_)); } @@ -787,6 +785,63 @@ void DxbcShaderTranslator::StartPixelShader() { PopSystemTemp(); } } + + if (current_shader().memexport_eM_written()) { + // Make sure memexport is done only once for a guest pixel. 
+ dxbc::Dest memexport_enabled_dest( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001)); + dxbc::Src memexport_enabled_src(dxbc::Src::R( + system_temp_memexport_enabled_and_eM_written_, dxbc::Src::kXXXX)); + uint32_t resolution_scaled_axes = + uint32_t(draw_resolution_scale_x_ > 1) | + (uint32_t(draw_resolution_scale_y_ > 1) << 1); + if (resolution_scaled_axes) { + uint32_t memexport_condition_temp = PushSystemTemp(); + // Only do memexport for one host pixel in a guest pixel - prefer the + // host pixel closer to the center of the guest pixel, but one that's + // covered with the half-pixel offset according to the top-left rule (1 + // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x + // because it's the center and is covered with the half-pixel offset too). + in_position_used_ |= resolution_scaled_axes; + a_.OpFToU(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::V1D(in_reg_ps_position_)); + a_.OpUDiv(dxbc::Dest::Null(), + dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::R(memexport_condition_temp), + dxbc::Src::LU(draw_resolution_scale_x_, + draw_resolution_scale_y_, 0, 0)); + a_.OpIEq(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::R(memexport_condition_temp), + dxbc::Src::LU(draw_resolution_scale_x_ >> 1, + draw_resolution_scale_y_ >> 1, 0, 0)); + for (uint32_t i = 0; i < 2; ++i) { + if (!(resolution_scaled_axes & (1 << i))) { + continue; + } + a_.OpAnd(memexport_enabled_dest, memexport_enabled_src, + dxbc::Src::R(memexport_condition_temp).Select(i)); + } + // Release memexport_condition_temp. + PopSystemTemp(); + } + // With sample-rate shading (with float24 conversion), only do memexport + // from one sample (as the shader is invoked multiple times for a pixel), + // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage, + // firstbit_lo returns 0xFFFFFFFF. 
+ if (IsSampleRate()) { + uint32_t memexport_condition_temp = PushSystemTemp(); + a_.OpFirstBitLo(dxbc::Dest::R(memexport_condition_temp, 0b0001), + dxbc::Src::VCoverage()); + a_.OpIEq( + dxbc::Dest::R(memexport_condition_temp, 0b0001), + dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY), + dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX)); + a_.OpAnd(memexport_enabled_dest, memexport_enabled_src, + dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX)); + // Release memexport_condition_temp. + PopSystemTemp(); + } + } } void DxbcShaderTranslator::StartTranslation() { @@ -883,34 +938,27 @@ void DxbcShaderTranslator::StartTranslation() { } } - if (!is_depth_only_pixel_shader_) { - // Allocate temporary registers for memexport addresses and data. - std::memset(system_temps_memexport_address_, 0xFF, - sizeof(system_temps_memexport_address_)); - std::memset(system_temps_memexport_data_, 0xFF, - sizeof(system_temps_memexport_data_)); - system_temp_memexport_written_ = UINT32_MAX; - const uint8_t* memexports_written = current_shader().memexport_eM_written(); - for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { - uint32_t memexport_alloc_written = memexports_written[i]; - if (memexport_alloc_written == 0) { - continue; - } - // If memexport is used at all, allocate a register containing whether eM# - // have actually been written to. - if (system_temp_memexport_written_ == UINT32_MAX) { - system_temp_memexport_written_ = PushSystemTemp(0b1111); - } - system_temps_memexport_address_[i] = PushSystemTemp(0b1111); - uint32_t memexport_data_index; - while (xe::bit_scan_forward(memexport_alloc_written, - &memexport_data_index)) { - memexport_alloc_written &= ~(1u << memexport_data_index); - system_temps_memexport_data_[i][memexport_data_index] = - PushSystemTemp(); - } + // Allocate temporary registers for memexport. 
+ uint8_t memexport_eM_written = current_shader().memexport_eM_written(); + if (memexport_eM_written) { + system_temp_memexport_enabled_and_eM_written_ = PushSystemTemp(0b0010); + // Initialize the memexport conditional to whether the shared memory is + // currently bound as UAV (to 0 or UINT32_MAX). It can be made narrower + // later. + a_.OpIBFE( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001), + dxbc::Src::LU(1), dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift), + LoadFlagsSystemConstant()); + system_temp_memexport_address_ = PushSystemTemp(0b1111); + uint8_t memexport_eM_remaining = memexport_eM_written; + uint32_t memexport_eM_index; + while (xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) { + memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index); + system_temps_memexport_data_[memexport_eM_index] = PushSystemTemp(0b1111); } + } + if (!is_depth_only_pixel_shader_) { // Allocate system temporary variables for the translated code. Since access // depends on the guest code (thus no guarantees), initialize everything // now (except for pv, it's an internal temporary variable, not accessible @@ -1089,27 +1137,19 @@ void DxbcShaderTranslator::CompleteShaderCode() { // - system_temp_grad_h_lod_. // - system_temp_grad_v_vfetch_address_. PopSystemTemp(6); + } - // Write memexported data to the shared memory UAV. - ExportToMemory(); + uint8_t memexport_eM_written = current_shader().memexport_eM_written(); + if (memexport_eM_written) { + // Write data for the last memexport. + ExportToMemory( + current_shader().memexport_eM_potentially_written_before_end()); - // Release memexport temporary registers. - for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) { - if (system_temps_memexport_address_[i] == UINT32_MAX) { - continue; - } - // Release exported data registers. - for (int j = 4; j >= 0; --j) { - if (system_temps_memexport_data_[i][j] != UINT32_MAX) { - PopSystemTemp(); - } - } - // Release the address register. 
- PopSystemTemp(); - } - if (system_temp_memexport_written_ != UINT32_MAX) { - PopSystemTemp(); - } + // Release memexport temporary registers: + // - system_temp_memexport_enabled_and_eM_written_. + // - system_temp_memexport_address_. + // - system_temps_memexport_data_. + PopSystemTemp(xe::bit_count(uint32_t(memexport_eM_written)) + 2); } // Write stage-specific epilogue. @@ -1512,36 +1552,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, dest = dxbc::Dest::R(system_temp_point_size_edge_flag_kill_vertex_); break; case InstructionStorageTarget::kExportAddress: - // Validate memexport writes (4D5307E6 has some completely invalid ones). - if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > Shader::kMaxMemExports || - system_temps_memexport_address_[memexport_alloc_current_count_ - 1] == - UINT32_MAX) { + if (!current_shader().memexport_eM_written()) { return; } - dest = dxbc::Dest::R( - system_temps_memexport_address_[memexport_alloc_current_count_ - 1]); + dest = dxbc::Dest::R(system_temp_memexport_address_); break; case InstructionStorageTarget::kExportData: { - // Validate memexport writes (4D5307E6 has some completely invalid ones). - if (memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > Shader::kMaxMemExports || - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [result.storage_index] == UINT32_MAX) { - return; - } - dest = dxbc::Dest::R( - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [result.storage_index]); + assert_not_zero(current_shader().memexport_eM_written() & + (uint8_t(1) << result.storage_index)); + dest = dxbc::Dest::R(system_temps_memexport_data_[result.storage_index]); // Mark that the eM# has been written to and needs to be exported. 
assert_not_zero(used_write_mask); - uint32_t memexport_index = memexport_alloc_current_count_ - 1; - a_.OpOr(dxbc::Dest::R(system_temp_memexport_written_, - 1 << (memexport_index >> 2)), - dxbc::Src::R(system_temp_memexport_written_) - .Select(memexport_index >> 2), - dxbc::Src::LU(uint32_t(1) << (result.storage_index + - ((memexport_index & 3) << 3)))); + a_.OpOr( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY), + dxbc::Src::LU(uint8_t(1) << result.storage_index)); } break; case InstructionStorageTarget::kColor: assert_not_zero(used_write_mask); @@ -1988,15 +2014,38 @@ void DxbcShaderTranslator::ProcessJumpInstruction( } void DxbcShaderTranslator::ProcessAllocInstruction( - const ParsedAllocInstruction& instr) { + const ParsedAllocInstruction& instr, uint8_t export_eM) { + bool start_memexport = instr.type == AllocType::kMemory && + current_shader().memexport_eM_written(); + if (export_eM || start_memexport) { + CloseExecConditionals(); + } + if (emit_source_map_) { instruction_disassembly_buffer_.Reset(); instr.Disassemble(&instruction_disassembly_buffer_); EmitInstructionDisassembly(); } - if (instr.type == AllocType::kMemory) { - ++memexport_alloc_current_count_; + if (export_eM) { + ExportToMemory(export_eM); + // Reset which eM# elements have been written. + a_.OpMov( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010), + dxbc::Src::LU(0)); + // Break dependencies from the previous memexport. + uint8_t export_eM_remaining = export_eM; + uint32_t eM_index; + while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) { + export_eM_remaining &= ~(uint8_t(1) << eM_index); + a_.OpMov(dxbc::Dest::R(system_temps_memexport_data_[eM_index]), + dxbc::Src::LF(0.0f)); + } + } + + if (start_memexport) { + // Initialize eA to an invalid address. 
+ a_.OpMov(dxbc::Dest::R(system_temp_memexport_address_), dxbc::Src::LU(0)); } } @@ -2849,7 +2898,7 @@ void DxbcShaderTranslator::WriteInputSignature() { // Sample index (SV_SampleIndex) for safe memexport with sample-rate // shading. size_t sample_index_position = SIZE_MAX; - if (current_shader().is_valid_memexport_used() && IsSampleRate()) { + if (current_shader().memexport_eM_written() && IsSampleRate()) { size_t sample_index_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + kParameterDwords); ++parameter_count; @@ -3623,7 +3672,7 @@ void DxbcShaderTranslator::WriteShaderCode() { dxbc::Name::kPosition); } bool sample_rate_memexport = - current_shader().is_valid_memexport_used() && IsSampleRate(); + current_shader().memexport_eM_written() && IsSampleRate(); // Sample-rate shading can't be done with UAV-only rendering (sample-rate // shading is only needed for float24 depth conversion when using a float32 // host depth buffer). diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index bcb38a21f..20fbdd328 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -20,6 +20,7 @@ #include "xenia/base/string_buffer.h" #include "xenia/gpu/dxbc.h" #include "xenia/gpu/shader_translator.h" +#include "xenia/gpu/ucode.h" #include "xenia/ui/graphics_provider.h" namespace xe { @@ -589,13 +590,16 @@ class DxbcShaderTranslator : public ShaderTranslator { void ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) override; void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override; - void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override; + void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) override; void ProcessVertexFetchInstruction( const ParsedVertexFetchInstruction& instr) override; void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) override; - void 
ProcessAluInstruction(const ParsedAluInstruction& instr) override; + void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) override; private: // IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature @@ -674,6 +678,11 @@ class DxbcShaderTranslator : public ShaderTranslator { // Frees the last allocated internal r# registers for later reuse. void PopSystemTemp(uint32_t count = 1); + // ExportToMemory modifies the values of eA/eM# for simplicity, call only + // before starting a new export or ending the invocation or making it + // inactive. + void ExportToMemory(uint8_t export_eM); + // Converts one scalar from piecewise linear gamma to linear. The target may // be the same as the source, the temporary variables must be different. If // the source is not pre-saturated, saturation will be done internally. @@ -728,7 +737,7 @@ class DxbcShaderTranslator : public ShaderTranslator { bool ROV_IsDepthStencilEarly() const { assert_true(edram_rov_used_); return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() && - !current_shader().is_valid_memexport_used(); + !current_shader().memexport_eM_written(); } // Converts the pre-clamped depth value to 24-bit (storing the result in bits // 0:23 and zeros in 24:31, not creating room for stencil - since this may be @@ -787,14 +796,6 @@ class DxbcShaderTranslator : public ShaderTranslator { void StartPixelShader_LoadROVParameters(); void StartPixelShader(); - // Writing the epilogue. - // ExportToMemory modifies the values of eA/eM# for simplicity, don't call - // multiple times. 
- void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count, - const uint32_t bits[4], - const dxbc::Src& is_integer, - const dxbc::Src& is_signed); - void ExportToMemory(); void CompleteVertexOrDomainShader(); // For RTV, adds the sample to coverage_temp.coverage_temp_component if it // passes alpha to mask (or, if initialize == true (for the first sample @@ -917,13 +918,16 @@ class DxbcShaderTranslator : public ShaderTranslator { .SelectFromSwizzled(word_index & 1); } - void KillPixel(bool condition, const dxbc::Src& condition_src); + void KillPixel(bool condition, const dxbc::Src& condition_src, + uint8_t memexport_eM_potentially_written_before); - void ProcessVectorAluOperation(const ParsedAluInstruction& instr, - uint32_t& result_swizzle, - bool& predicate_written); - void ProcessScalarAluOperation(const ParsedAluInstruction& instr, - bool& predicate_written); + void ProcessVectorAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle, + bool& predicate_written); + void ProcessScalarAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written); void WriteResourceDefinition(); void WriteInputSignature(); @@ -1124,14 +1128,16 @@ class DxbcShaderTranslator : public ShaderTranslator { // writing). uint32_t system_temps_color_[4]; - // Bits containing whether each eM# has been written, for up to 16 streams, or - // UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with - // 4 `alloc export`s per component. - uint32_t system_temp_memexport_written_; - // eA in each `alloc export`, or UINT32_MAX if not used. - uint32_t system_temps_memexport_address_[Shader::kMaxMemExports]; - // eM# in each `alloc export`, or UINT32_MAX if not used. 
- uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5]; + // Memory export temporary registers are allocated if the shader writes any + // eM# (current_shader().memexport_eM_written() != 0). + // X - whether memexport is enabled for this invocation. + // Y - which eM# elements have been written so far by the invocation since the + // last memory write. + uint32_t system_temp_memexport_enabled_and_eM_written_; + // eA. + uint32_t system_temp_memexport_address_; + // eM#. + uint32_t system_temps_memexport_data_[ucode::kMaxMemExportElementCount]; // Vector ALU or fetch result / scratch (since Xenos write masks can contain // swizzles). @@ -1195,10 +1201,6 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t uav_index_edram_; std::vector sampler_bindings_; - - // Number of `alloc export`s encountered so far in the translation. The index - // of the current eA/eM# temp register set is this minus 1, if it's not 0. - uint32_t memexport_alloc_current_count_; }; } // namespace gpu diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 948406b90..a1d2970f0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -19,22 +19,29 @@ namespace xe { namespace gpu { using namespace ucode; -void DxbcShaderTranslator::KillPixel(bool condition, - const dxbc::Src& condition_src) { +void DxbcShaderTranslator::KillPixel( + bool condition, const dxbc::Src& condition_src, + uint8_t memexport_eM_potentially_written_before) { + a_.OpIf(condition, condition_src); + // Perform outstanding memory exports before the invocation becomes inactive + // and UAV writes are disabled. + ExportToMemory(memexport_eM_potentially_written_before); // Discard the pixel, but continue execution if other lanes in the quad need // this lane for derivatives. The driver may also perform early exiting // internally if all lanes are discarded if deemed beneficial. 
- a_.OpDiscard(condition, condition_src); + a_.OpDiscard(true, dxbc::Src::LU(UINT32_MAX)); if (edram_rov_used_) { // Even though discarding disables all subsequent UAV/ROV writes, also skip // as much of the Render Backend emulation logic as possible by setting the // coverage and the mask of the written render targets to zero. a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0)); } + a_.OpEndIf(); } void DxbcShaderTranslator::ProcessVectorAluOperation( - const ParsedAluInstruction& instr, uint32_t& result_swizzle, + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle, bool& predicate_written) { result_swizzle = dxbc::Src::kXYZW; predicate_written = false; @@ -506,7 +513,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -522,7 +530,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -538,7 +547,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), 
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -554,7 +564,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -640,7 +651,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( } void DxbcShaderTranslator::ProcessScalarAluOperation( - const ParsedAluInstruction& instr, bool& predicate_written) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written) { predicate_written = false; if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev) { @@ -950,27 +962,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kKillsEq: a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsGt: a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsGe: a_.OpGE(ps_dest, operand_0_a, 
dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsNe: a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsOne: a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; @@ -1024,7 +1036,8 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( } void DxbcShaderTranslator::ProcessAluInstruction( - const ParsedAluInstruction& instr) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) { if (instr.IsNop()) { // Don't even disassemble or update predication. return; @@ -1041,10 +1054,11 @@ void DxbcShaderTranslator::ProcessAluInstruction( // checked again later. 
bool predicate_written_vector = false; uint32_t vector_result_swizzle = dxbc::Src::kXYZW; - ProcessVectorAluOperation(instr, vector_result_swizzle, - predicate_written_vector); + ProcessVectorAluOperation(instr, memexport_eM_potentially_written_before, + vector_result_swizzle, predicate_written_vector); bool predicate_written_scalar = false; - ProcessScalarAluOperation(instr, predicate_written_scalar); + ProcessScalarAluOperation(instr, memexport_eM_potentially_written_before, + predicate_written_scalar); StoreResult(instr.vector_and_constant_result, dxbc::Src::R(system_temp_result_, vector_result_swizzle), diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index c48facc08..1049fa739 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -2,533 +2,830 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2023 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ +#include +#include +#include + #include "xenia/base/assert.h" #include "xenia/base/math.h" -#include "xenia/gpu/draw_util.h" #include "xenia/gpu/dxbc_shader_translator.h" -#include "xenia/gpu/texture_cache.h" namespace xe { namespace gpu { using namespace ucode; -// TODO(Triang3l): Support sub-dword memexports (like k_8 in 58410B86). This -// would require four 128 MB R8_UINT UAVs due to -// D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP. Need to be careful with -// resource binding tiers, however. Resource binding tier 1 on feature level -// 11_0 allows only 8 UAVs _across all stages_. 
RWByteAddressBuffer + 4 typed -// buffers is 5 per stage already, would need 10 for both VS and PS, or even 11 -// with the eDRAM ROV. Need to drop draw commands doing memexport in both VS and -// PS on FL 11_0 resource binding tier 1. - -void DxbcShaderTranslator::ExportToMemory_PackFixed32( - const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4], - const dxbc::Src& is_integer, const dxbc::Src& is_signed) { - // Will insert with BFI - sign extension of red will be overwritten, not - // truncated. - assert_not_zero(bits[0]); - assert_true(bits[0] + bits[1] + bits[2] + bits[3] == 32); - uint32_t mask = 0; - for (uint32_t i = 0; i < 4; ++i) { - if (bits[i]) { - mask |= 1 << i; - } - } - a_.OpIf(true, is_signed); - { - float range[4]; - for (uint32_t i = 0; i < 4; ++i) { - range[i] = bits[i] ? float((uint32_t(1) << (bits[i] - 1)) - 1) : 0.0f; - } - dxbc::Src range_src(dxbc::Src::LP(range)); - a_.OpIf(false, is_integer); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src); - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask)); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[i])); - // TODO(Triang3l): NaN should become zero, not -range. 
- a_.OpMax(eM_dest, eM_src, -range_src); - a_.OpMin(eM_dest, eM_src, range_src); - } - } - a_.OpElse(); - { - float range[4]; - for (uint32_t i = 0; i < 4; ++i) { - range[i] = float((uint32_t(1) << bits[i]) - 1); - } - dxbc::Src range_src(dxbc::Src::LP(range)); - a_.OpIf(false, is_integer); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src); - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask)); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[i])); - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f)); - a_.OpMin(eM_dest, eM_src, range_src); - } - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - // Round to the nearest integer, according to the rules of handling integer - // formats in Direct3D. - // TODO(Triang3l): Round by adding +-0.5, not with round_ne. - a_.OpRoundNE(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp)); - a_.OpFToI(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp)); - dxbc::Dest eM_packed_dest(dxbc::Dest::R(eM_temp, 0b0001)); - dxbc::Src eM_packed_src(dxbc::Src::R(eM_temp, dxbc::Src::kXXXX)); - uint32_t offset = bits[0]; - for (uint32_t j = 1; j < 4; ++j) { - if (!bits[j]) { - continue; - } - a_.OpBFI(eM_packed_dest, dxbc::Src::LU(bits[j]), dxbc::Src::LU(offset), - dxbc::Src::R(eM_temp).Select(j), eM_packed_src); - offset += bits[j]; - } - } -} - -void DxbcShaderTranslator::ExportToMemory() { - if (system_temp_memexport_written_ == UINT32_MAX) { - // No exports in the shader. +void DxbcShaderTranslator::ExportToMemory(uint8_t export_eM) { + if (!export_eM) { return; } - // Allocate a register for temporary values at various stages. - uint32_t control_temp = PushSystemTemp(); + assert_zero(export_eM & ~current_shader().memexport_eM_written()); - // Safety check if the shared memory is bound as UAV. 
- a_.OpUBFE(dxbc::Dest::R(control_temp, 0b0001), dxbc::Src::LU(1), - dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift), - LoadFlagsSystemConstant()); - // Open the `if` with the uniform condition for the shared memory buffer being - // bound as a UAV (more fine-grained checks are vector and likely divergent). - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); + // Check if memory export is allowed in this invocation. + a_.OpIf(true, dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kXXXX)); - // Check more fine-grained limitations. - bool inner_condition_provided = false; - if (is_pixel_shader()) { - uint32_t resolution_scaled_axes = - uint32_t(draw_resolution_scale_x_ > 1) | - (uint32_t(draw_resolution_scale_y_ > 1) << 1); - if (resolution_scaled_axes) { - // Only do memexport for one host pixel in a guest pixel - prefer the - // host pixel closer to the center of the guest pixel, but one that's - // covered with the half-pixel offset according to the top-left rule (1 - // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x - // because it's the center and is covered with the half-pixel offset too). - // Using control_temp.yz as per-axis temporary variables. - in_position_used_ |= resolution_scaled_axes; - a_.OpFToU(dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), - dxbc::Src::V1D(in_reg_ps_position_, 0b0100 << 2)); - a_.OpUDiv(dxbc::Dest::Null(), - dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), - dxbc::Src::R(control_temp, 0b1001 << 2), - dxbc::Src::LU(0, draw_resolution_scale_x_, - draw_resolution_scale_y_, 0)); - for (uint32_t i = 0; i < 2; ++i) { - if (!(resolution_scaled_axes & (1 << i))) { - continue; - } - // If there's no inner condition in control_temp.x yet, the condition - // for the current axis can go directly to it. Otherwise, need to merge - // with the previous condition, using control_temp.y or .z as an - // intermediate variable. 
- dxbc::Src resolution_scaled_axis_src( - dxbc::Src::R(control_temp).Select(1 + i)); - a_.OpIEq( - dxbc::Dest::R(control_temp, - inner_condition_provided ? 1 << (1 + i) : 0b0001), - resolution_scaled_axis_src, - dxbc::Src::LU( - (i ? draw_resolution_scale_y_ : draw_resolution_scale_x_) >> - 1)); - if (inner_condition_provided) { - // Merge with the previous condition in control_temp.x. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - resolution_scaled_axis_src); - } - inner_condition_provided = true; - } - } - // With sample-rate shading (with float24 conversion), only do memexport - // from one sample (as the shader is invoked multiple times for a pixel), - // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage, - // firstbit_lo returns 0xFFFFFFFF. - if (IsSampleRate()) { - a_.OpFirstBitLo(dxbc::Dest::R(control_temp, 0b0010), - dxbc::Src::VCoverage()); - a_.OpIEq( - dxbc::Dest::R(control_temp, - inner_condition_provided ? 0b0010 : 0b0001), - dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY), - dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - if (inner_condition_provided) { - // Merge with the previous condition in control_temp.x. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - } - inner_condition_provided = true; - } - } - // Open the inner (vector) conditional if needed. - if (inner_condition_provided) { - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); - } - // control_temp.x is now free. - - for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { - uint32_t eA_temp = system_temps_memexport_address_[i]; - if (eA_temp == UINT32_MAX) { - // Export not used. - continue; - } - // For simplicity of access, gather actually used eM# registers for this - // export. Zero-initialize eM_offsets because excess elements of it may be - // accessed, for stable caching. 
- uint32_t eM_temps[5], eM_offsets[5] = {}, eM_count = 0; - for (uint32_t j = 0; j < 5; ++j) { - uint32_t eM_temp = system_temps_memexport_data_[i][j]; - if (eM_temp == UINT32_MAX) { - continue; - } - eM_temps[eM_count] = eM_temp; - eM_offsets[eM_count] = j; - ++eM_count; - } - if (eM_count == 0) { - continue; - } - - // Swap red and blue if needed. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(uint32_t(1) << 19)); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMovC(dxbc::Dest::R(eM_temp, 0b0101), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - dxbc::Src::R(eM_temp, 0b000010), dxbc::Src::R(eM_temp)); - } - - // Initialize element size in control_temp.x to 4 bytes as this is the most - // common size. - dxbc::Dest element_size_dest(dxbc::Dest::R(control_temp, 0b0001)); - dxbc::Src element_size_src(dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); - a_.OpMov(element_size_dest, dxbc::Src::LU(4)); - - // Each eM should get a packed value in the destination format now. - - // Extract format properties to control_temp. - // Y - signedness if fixed-point. - // Z - fractional/integer if fixed-point. - // W - color format. - a_.OpUBFE(dxbc::Dest::R(control_temp, 0b1110), dxbc::Src::LU(0, 1, 1, 6), - dxbc::Src::LU(0, 16, 17, 8), - dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ)); - dxbc::Src is_signed(dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - dxbc::Src is_integer(dxbc::Src::R(control_temp, dxbc::Src::kZZZZ)); - // Convert and pack the format. - a_.OpSwitch(dxbc::Src::R(control_temp, dxbc::Src::kWWWW)); - // control_temp.w is now free. 
- { - // k_8_8_8_8 - // k_8_8_8_8_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16))); - { - uint32_t bits[4] = {8, 8, 8, 8}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_2_10_10_10 - // k_2_10_10_10_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16))); - { - uint32_t bits[4] = {10, 10, 10, 2}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_10_11_11 - // k_10_11_11_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16))); - { - uint32_t bits[4] = {11, 11, 10}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_11_11_10 - // k_11_11_10_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16))); - { - uint32_t bits[4] = {10, 11, 11}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16))); - { - uint32_t bits[4] = {16, 16}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - a_.OpIf(true, is_signed); - { - a_.OpIf(false, is_integer); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp), - dxbc::Src::LF(32767.0f)); - } - 
a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // TODO(Triang3l): NaN should become zero, not -range. - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(-32767.0f)); - a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(32767.0f)); - } - } - a_.OpElse(); - { - a_.OpIf(false, is_integer); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp), - dxbc::Src::LF(65535.0f)); - } - a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f)); - a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(65535.0f)); - } - } - a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - // Round to the nearest integer, according to the rules of handling - // integer formats in Direct3D. - // TODO(Triang3l): Round by adding +-0.5, not with round_ne. 
- a_.OpRoundNE(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpFToI(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101), - dxbc::Src::R(eM_temp, 0b1000)); - } - a_.OpBreak(); - - // k_16_16_FLOAT - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT))); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpF32ToF16(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0001), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, dxbc::Src::kYYYY), - dxbc::Src::R(eM_temp, dxbc::Src::kXXXX)); - } - a_.OpBreak(); - - // k_16_16_16_16_FLOAT - a_.OpCase( - dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpF32ToF16(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101), - dxbc::Src::R(eM_temp, 0b1000)); - } - a_.OpBreak(); - - // k_32_FLOAT - // Already in the destination format, 4 bytes per element already - // selected. - - // k_32_32_FLOAT - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - // Already in the destination format. - a_.OpBreak(); - - // k_32_32_32_32_FLOAT - a_.OpCase( - dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(16)); - // Already in the destination format. - a_.OpBreak(); - } - a_.OpEndSwitch(); - // control_temp.yz are now free. - - // Do endian swap. - { - dxbc::Dest endian_dest(dxbc::Dest::R(control_temp, 0b0010)); - dxbc::Src endian_src(dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - // Extract endianness into control_temp.y. 
- a_.OpAnd(endian_dest, dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(0b111)); - - // Change 8-in-64 and 8-in-128 to 8-in-32. - for (uint32_t j = 0; j < 2; ++j) { - a_.OpIEq(dxbc::Dest::R(control_temp, 0b0100), endian_src, - dxbc::Src::LU(uint32_t(j ? xenos::Endian128::k8in128 - : xenos::Endian128::k8in64))); - for (uint32_t k = 0; k < eM_count; ++k) { - uint32_t eM_temp = eM_temps[k]; - a_.OpMovC(dxbc::Dest::R(eM_temp), - dxbc::Src::R(control_temp, dxbc::Src::kZZZZ), - dxbc::Src::R(eM_temp, j ? 0b00011011 : 0b10110001), - dxbc::Src::R(eM_temp)); - } - a_.OpMovC(endian_dest, dxbc::Src::R(control_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)), - endian_src); - } - - uint32_t swap_temp = PushSystemTemp(); - dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp)); - dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp)); - - // 8-in-16 or one half of 8-in-32. - a_.OpSwitch(endian_src); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16))); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // Temp = X0Z0. - a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); - // eM = YZW0. - a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8)); - // eM = Y0W0. - a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); - // eM = YXWZ. - a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src); - } - a_.OpBreak(); - a_.OpEndSwitch(); - - // 16-in-32 or another half of 8-in-32. - a_.OpSwitch(endian_src); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32))); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // Temp = ZW00. - a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16)); - // eM = ZWXY. 
- a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src, - swap_temp_src); - } - a_.OpBreak(); - a_.OpEndSwitch(); - - // Release swap_temp. - PopSystemTemp(); - } - // control_temp.yz are now free. - - dxbc::Dest address_dest(dxbc::Dest::R(eA_temp, 0b0001)); - dxbc::Src address_src(dxbc::Src::R(eA_temp, dxbc::Src::kXXXX)); - // Multiply the base address by dword size, also dropping the 0x40000000 - // bit. - a_.OpIShL(address_dest, address_src, dxbc::Src::LU(2)); - // Drop the exponent in the element index. - a_.OpAnd(dxbc::Dest::R(eA_temp, 0b0010), - dxbc::Src::R(eA_temp, dxbc::Src::kYYYY), - dxbc::Src::LU((1 << 23) - 1)); - // Add the offset of the first written element to the base address. - a_.OpUMAd(address_dest, dxbc::Src::R(eA_temp, dxbc::Src::kYYYY), - element_size_src, address_src); - // Do the writes. - dxbc::Src eM_written_src( - dxbc::Src::R(system_temp_memexport_written_).Select(i >> 2)); - uint32_t eM_written_base = 1u << ((i & 3) << 3); - for (uint32_t j = 0; j < eM_count; ++j) { - // Go to the next eM#. - uint32_t eM_relative_offset = eM_offsets[j] - (j ? eM_offsets[j - 1] : 0); - if (eM_relative_offset) { - if (eM_relative_offset == 1) { - a_.OpIAdd(address_dest, element_size_src, address_src); - } else { - a_.OpUMAd(address_dest, dxbc::Src::LU(eM_relative_offset), - element_size_src, address_src); - } - } - // Check if the eM# was actually written to on the execution path. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0010), eM_written_src, - dxbc::Src::LU(eM_written_base << eM_offsets[j])); - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - // Write the element of the needed size. 
- dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - a_.OpSwitch(element_size_src); - for (uint32_t k = 1; k <= 4; k <<= 1) { - a_.OpCase(dxbc::Src::LU(k * 4)); - if (uav_index_shared_memory_ == kBindingIndexUnallocated) { - uav_index_shared_memory_ = uav_count_++; - } - a_.OpStoreRaw( - dxbc::Dest::U(uav_index_shared_memory_, - uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1), - address_src, eM_src); - a_.OpBreak(); - } - a_.OpEndSwitch(); - a_.OpEndIf(); - } - // control_temp.y is now free. + // Check if the address with the correct sign and exponent was written, and + // that the index doesn't overflow the mantissa bits. + { + uint32_t address_check_temp = PushSystemTemp(); + a_.OpUShR(dxbc::Dest::R(address_check_temp), + dxbc::Src::R(system_temp_memexport_address_), + dxbc::Src::LU(30, 23, 23, 23)); + a_.OpIEq(dxbc::Dest::R(address_check_temp), + dxbc::Src::R(address_check_temp), + dxbc::Src::LU(0x1, 0x96, 0x96, 0x96)); + a_.OpAnd(dxbc::Dest::R(address_check_temp, 0b0011), + dxbc::Src::R(address_check_temp), + dxbc::Src::R(address_check_temp, 0b1110)); + a_.OpAnd(dxbc::Dest::R(address_check_temp, 0b0001), + dxbc::Src::R(address_check_temp, dxbc::Src::kXXXX), + dxbc::Src::R(address_check_temp, dxbc::Src::kYYYY)); + a_.OpIf(true, dxbc::Src::R(address_check_temp, dxbc::Src::kXXXX)); + // Release address_check_temp. + PopSystemTemp(); } - // Close the inner memexport possibility conditional. - if (inner_condition_provided) { + uint8_t eM_remaining; + uint32_t eM_index; + + // Swap red and blue components if needed. + { + uint32_t red_blue_swap_temp = PushSystemTemp(); + a_.OpIBFE(dxbc::Dest::R(red_blue_swap_temp, 0b0001), dxbc::Src::LU(1), + dxbc::Src::LU(19), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + a_.OpIf(true, dxbc::Src::R(red_blue_swap_temp, dxbc::Src::kXXXX)); + // Release red_blue_swap_temp. 
+ PopSystemTemp(); + + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + a_.OpMov( + dxbc::Dest::R(system_temps_memexport_data_[eM_index], 0b0101), + dxbc::Src::R(system_temps_memexport_data_[eM_index], 0b11000110)); + } + + // Close the red/blue swap conditional. a_.OpEndIf(); } - // Close the outer memexport possibility conditional. + uint32_t temp = PushSystemTemp(); + + // Extract the color format and the numeric format. + // temp.x = color format. + // temp.y = numeric format is signed. + // temp.z = numeric format is integer. + a_.OpUBFE(dxbc::Dest::R(temp, 0b0111), dxbc::Src::LU(6, 1, 1, 0), + dxbc::Src::LU(8, 16, 17, 0), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + + // Perform format packing. + // After the switch, temp.x must contain log2 of the number of bytes in an + // element, of UINT32_MAX if the format is unknown. + a_.OpSwitch(dxbc::Src::R(temp, dxbc::Src::kXXXX)); + { + dxbc::Dest element_size_dest(dxbc::Dest::R(temp, 0b0001)); + dxbc::Src num_format_signed(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + dxbc::Src num_format_integer(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + auto flush_nan = [this, export_eM](uint32_t components) { + uint8_t eM_remaining = export_eM; + uint32_t eM_index; + uint32_t is_nan_temp = PushSystemTemp(); + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpNE(dxbc::Dest::R(is_nan_temp, components), dxbc::Src::R(eM), + dxbc::Src::R(eM)); + a_.OpMovC(dxbc::Dest::R(eM, components), dxbc::Src::R(is_nan_temp), + dxbc::Src::LF(0.0f), dxbc::Src::R(eM)); + } + // Release is_nan_temp. + PopSystemTemp(); + }; + + // The result will be in eM#.x. 
The widths must be without holes (R, RG, + // RGB, RGBA), and expecting the widths to add up to the size of the stored + // texel (8, 16 or 32 bits), as the unused upper bits will contain junk from + // the sign extension of X if the number is signed. + auto pack_8_16_32 = [&](std::array widths) { + uint8_t eM_remaining; + uint32_t eM_index; + + uint32_t components = 0; + std::array offsets = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i]) { + // Only formats for which max + 0.5 can be represented exactly. + assert(widths[i] <= 23); + components |= uint32_t(1) << i; + } + if (i) { + offsets[i] = offsets[i - 1] + widths[i - 1]; + } + } + // Will be packing components into eM#.x starting from green, assume red + // will already be there after the conversion. + assert_not_zero(components & 0b1); + + flush_nan(components); + + a_.OpIf(true, num_format_signed); + { + // Signed. + a_.OpIf(true, num_format_integer); + { + // Signed integer. + float min_value[4] = {}, max_value[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i]) { + max_value[i] = float((uint32_t(1) << (widths[i] - 1)) - 1); + min_value[i] = -1.0f - max_value[i]; + } + } + dxbc::Src min_value_src(dxbc::Src::LP(min_value)); + dxbc::Src max_value_src(dxbc::Src::LP(max_value)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), min_value_src, + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), max_value_src, + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Signed normalized. 
+ uint32_t scale_components = 0; + float scale[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i] > 2) { + scale_components |= uint32_t(1) << i; + scale[i] = float((uint32_t(1) << (widths[i] - 1)) - 1); + } + } + dxbc::Src scale_src(dxbc::Src::LP(scale)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), dxbc::Src::LF(-1.0f), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), dxbc::Src::LF(1.0f), + dxbc::Src::R(eM)); + if (scale_components) { + a_.OpMul(dxbc::Dest::R(eM, scale_components), dxbc::Src::R(eM), + scale_src); + } + } + } + a_.OpEndIf(); + + // Add plus/minus 0.5 before truncating according to the Direct3D format + // conversion rules, and convert to signed integers. + uint32_t round_bias_temp = PushSystemTemp(); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, components), dxbc::Src::LU(31), + dxbc::Src::LU(0), dxbc::Src::LF(0.5f), dxbc::Src::R(eM)); + a_.OpAdd(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), + dxbc::Src::R(round_bias_temp)); + a_.OpFToI(dxbc::Dest::R(eM, components), dxbc::Src::R(eM)); + } + // Release round_bias_temp. + PopSystemTemp(); + } + a_.OpElse(); + { + // Unsigned. + a_.OpIf(true, num_format_integer); + { + // Unsigned integer. 
+ float max_value[4]; + for (uint32_t i = 0; i < 4; ++i) { + max_value[i] = float((uint32_t(1) << widths[i]) - 1); + } + dxbc::Src max_value_src(dxbc::Src::LP(max_value)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), dxbc::Src::LF(0.0f), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), max_value_src, + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Unsigned normalized. + uint32_t scale_components = 0; + float scale[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i] > 1) { + scale_components |= uint32_t(1) << i; + scale[i] = float((uint32_t(1) << widths[i]) - 1); + } + } + dxbc::Src scale_src(dxbc::Src::LP(scale)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + // Saturate. + a_.OpMov(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), true); + if (scale_components) { + a_.OpMul(dxbc::Dest::R(eM, scale_components), dxbc::Src::R(eM), + scale_src); + } + } + } + a_.OpEndIf(); + + // Add 0.5 before truncating according to the Direct3D format conversion + // rules, and convert to unsigned integers. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpAdd(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), + dxbc::Src::LF(0.5f)); + a_.OpFToU(dxbc::Dest::R(eM, components), dxbc::Src::R(eM)); + } + } + a_.OpEndIf(); + + // Pack into 32 bits. 
+ for (uint32_t i = 0; i < 4; ++i) { + if (!widths[i]) { + continue; + } + dxbc::Src width_src(dxbc::Src::LU(widths[i])); + dxbc::Src offset_src(dxbc::Src::LU(offsets[i])); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, 0b0001), width_src, offset_src, + dxbc::Src::R(eM).Select(i), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + } + }; + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8))); + // TODO(Triang3l): Investigate how input should be treated for k_8_A, k_8_B, + // k_8_8_8_8_A. + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_A))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_B))); + { + pack_8_16_32({8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(0)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_1_5_5_5))); + { + pack_8_16_32({5, 5, 5, 1}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_5_6_5))); + { + pack_8_16_32({5, 6, 5}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_6_5_5))); + { + pack_8_16_32({5, 5, 6}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_A))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16))); + { + pack_8_16_32({8, 8, 8, 8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10))); + a_.OpCase(dxbc::Src::LU( + uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16))); + { + pack_8_16_32({10, 10, 10, 2}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + 
a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8))); + { + pack_8_16_32({8, 8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_4_4_4_4))); + { + pack_8_16_32({4, 4, 4, 4}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16))); + { + pack_8_16_32({11, 11, 10}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16))); + { + pack_8_16_32({10, 11, 11}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16))); + { + pack_8_16_32({16}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16))); + { + pack_8_16_32({16, 16}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16))); + { + flush_nan(0b1111); + + a_.OpIf(true, num_format_signed); + { + // Signed. + a_.OpIf(true, num_format_integer); + { + // Signed integer. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(float(INT16_MIN)), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(float(INT16_MAX)), + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Signed normalized. 
+ eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(-1.0f), dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(1.0f), dxbc::Src::R(eM)); + a_.OpMul(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::LF(float(INT16_MAX))); + } + } + a_.OpEndIf(); + + // Add plus/minus 0.5 before truncating according to the Direct3D format + // conversion rules, and convert to signed integers. + uint32_t round_bias_temp = PushSystemTemp(); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM), dxbc::Src::LU(31), dxbc::Src::LU(0), + dxbc::Src::LF(0.5f), dxbc::Src::R(eM)); + a_.OpAdd(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::R(round_bias_temp)); + a_.OpFToI(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + } + // Release round_bias_temp. + PopSystemTemp(); + } + a_.OpElse(); + { + // Unsigned. + a_.OpIf(true, num_format_integer); + { + // Unsigned integer. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(0.0f), dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(float(UINT16_MAX)), + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Unsigned normalized. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + // Saturate. 
+ a_.OpMov(dxbc::Dest::R(eM), dxbc::Src::R(eM), true); + a_.OpMul(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::LF(float(UINT16_MAX))); + } + } + a_.OpEndIf(); + + // Add 0.5 before truncating according to the Direct3D format conversion + // rules, and convert to unsigned integers. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpAdd(dxbc::Dest::R(eM), dxbc::Src::R(eM), dxbc::Src::LF(0.5f)); + a_.OpFToU(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + } + } + a_.OpEndIf(); + + // Pack. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, 0b0011), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, 0b1101), + dxbc::Src::R(eM, 0b1000)); + } + + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM, 0b0001), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. 
+ eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM, 0b0011), dxbc::Src::R(eM)); + a_.OpBFI(dxbc::Dest::R(eM, 0b0001), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, dxbc::Src::kYYYY), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + a_.OpBFI(dxbc::Dest::R(eM, 0b0011), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, 0b1101), + dxbc::Src::R(eM, 0b1000)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(4)); + } + a_.OpBreak(); + + a_.OpDefault(); + a_.OpMov(element_size_dest, dxbc::Src::LU(UINT32_MAX)); + a_.OpBreak(); + } + // Close the color format switch. + a_.OpEndSwitch(); + + dxbc::Src element_size_src(dxbc::Src::R(temp, dxbc::Src::kXXXX)); + + // Only temp.x is used currently (for the element size log2). + + // Do endian swap, using temp.y for the endianness value, and temp.z as a + // temporary value. 
+ { + dxbc::Dest endian_dest(dxbc::Dest::R(temp, 0b0010)); + dxbc::Src endian_src(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + // Extract endianness into temp.y. + a_.OpUBFE(endian_dest, dxbc::Src::LU(3), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + + // Change 8-in-64 and 8-in-128 to 8-in-32. + for (uint32_t i = 0; i < 2; ++i) { + a_.OpIEq(dxbc::Dest::R(temp, 0b0100), endian_src, + dxbc::Src::LU(uint32_t(i ? xenos::Endian128::k8in128 + : xenos::Endian128::k8in64))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMovC(dxbc::Dest::R(eM), dxbc::Src::R(temp, dxbc::Src::kZZZZ), + dxbc::Src::R(eM, i ? 0b00011011 : 0b10110001), + dxbc::Src::R(eM)); + } + a_.OpMovC(endian_dest, dxbc::Src::R(temp, dxbc::Src::kZZZZ), + dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)), endian_src); + } + + uint32_t swap_temp = PushSystemTemp(); + dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp)); + dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp)); + + // 8-in-16 or one half of 8-in-32. + a_.OpSwitch(endian_src); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + dxbc::Dest eM_dest(dxbc::Dest::R(eM)); + dxbc::Src eM_src(dxbc::Src::R(eM)); + // Temp = X0Z0. + a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); + // eM = YZW0. + a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8)); + // eM = Y0W0. + a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); + // eM = YXWZ. + a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src); + } + a_.OpBreak(); + a_.OpEndSwitch(); + + // 16-in-32 or another half of 8-in-32. 
+ a_.OpSwitch(endian_src); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + dxbc::Dest eM_dest(dxbc::Dest::R(eM)); + dxbc::Src eM_src(dxbc::Src::R(eM)); + // Temp = ZW00. + a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16)); + // eM = ZWXY. + a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src, + swap_temp_src); + } + a_.OpBreak(); + a_.OpEndSwitch(); + + // Release swap_temp. + PopSystemTemp(); + } + + // Extract the base index to temp.y and the index upper bound to temp.z. + a_.OpUBFE(dxbc::Dest::R(temp, 0b0110), dxbc::Src::LU(23), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_address_, 0b1101 << 2)); + dxbc::Dest eM0_address_dest(dxbc::Dest::R(temp, 0b0010)); + dxbc::Src eM0_address_src(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + dxbc::Src index_count_src(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + // Check if eM0 isn't out of bounds via temp.w - if it is, eM1...4 also are + // (the base index can't be negative). + a_.OpILT(dxbc::Dest::R(temp, 0b1000), eM0_address_src, index_count_src); + a_.OpIf(true, dxbc::Src::R(temp, dxbc::Src::kWWWW)); + + // Extract the base address to temp.w as bytes (30 lower bits to 30 upper bits + // with 0 below). + a_.OpIShL(dxbc::Dest::R(temp, 0b1000), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kXXXX), + dxbc::Src::LU(2)); + dxbc::Src base_address_src(dxbc::Src::R(temp, dxbc::Src::kWWWW)); + + uint8_t export_eM14 = export_eM >> 1; + assert_zero(export_eM14 >> 4); + uint32_t eM14_address_temp = UINT32_MAX, store_eM14_temp = UINT32_MAX; + if (export_eM14) { + // Get eM1...4 indices and check if they're in bounds. 
+ eM14_address_temp = PushSystemTemp(); + dxbc::Dest eM14_address_dest(dxbc::Dest::R(eM14_address_temp, export_eM14)); + dxbc::Src eM14_address_src(dxbc::Src::R(eM14_address_temp)); + store_eM14_temp = PushSystemTemp(); + dxbc::Dest store_eM14_dest(dxbc::Dest::R(store_eM14_temp, export_eM14)); + dxbc::Src store_eM14_src(dxbc::Src::R(store_eM14_temp)); + a_.OpIAdd(eM14_address_dest, eM0_address_src, dxbc::Src::LU(1, 2, 3, 4)); + a_.OpILT(store_eM14_dest, eM14_address_src, index_count_src); + // Check if eM1...4 were actually written by the invocation and merge the + // result with store_eM14_temp. + uint32_t eM14_written_temp = PushSystemTemp(); + a_.OpIBFE(dxbc::Dest::R(eM14_written_temp, export_eM14), dxbc::Src::LU(1), + dxbc::Src::LU(1, 2, 3, 4), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY)); + a_.OpAnd(store_eM14_dest, store_eM14_src, dxbc::Src::R(eM14_written_temp)); + // Release eM14_written_temp. + PopSystemTemp(); + // Convert eM1...4 indices to global byte addresses. + a_.OpIShL(eM14_address_dest, eM14_address_src, element_size_src); + a_.OpIAdd(eM14_address_dest, base_address_src, eM14_address_src); + } + if (export_eM & 0b1) { + // Convert eM0 index to a global byte address if it's needed. + a_.OpIShL(eM0_address_dest, eM0_address_src, element_size_src); + a_.OpIAdd(eM0_address_dest, base_address_src, eM0_address_src); + // base_address_src and index_count_src are deallocated at this point (even + // if eM0 isn't potentially written), temp.zw are now free. + // Extract if eM0 was actually written by the invocation to temp.z. + a_.OpIBFE(dxbc::Dest::R(temp, 0b0100), dxbc::Src::LU(1), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY)); + } + dxbc::Src eM0_written_src(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + // Write depending on the element size. 
+ // No switch case will be entered for an unknown format (UINT32_MAX size + // written), so writing won't be attempted for it. + if (uav_index_shared_memory_ == kBindingIndexUnallocated) { + uav_index_shared_memory_ = uav_count_++; + } + uint8_t eM14_remaining; + uint32_t eM14_index; + a_.OpSwitch(element_size_src); + + // 8bpp, 16bpp. + dxbc::Dest atomic_dest(dxbc::Dest::U( + uav_index_shared_memory_, uint32_t(UAVRegister::kSharedMemory), 0)); + for (uint32_t i = 0; i <= 1; ++i) { + a_.OpCase(dxbc::Src::LU(i)); + dxbc::Src width_src(dxbc::Src::LU(8 << i)); + uint32_t sub_dword_temp = PushSystemTemp(); + if (export_eM & 0b1) { + a_.OpIf(true, eM0_written_src); + // sub_dword_temp.x = eM0 offset in the dword (8 << (byte_address & 3)) + // (assuming a little-endian host). + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0001), dxbc::Src::LU(2), + dxbc::Src::LU(3), eM0_address_src, dxbc::Src::LU(0)); + // Keep only the dword part of the address. + a_.OpAnd(eM0_address_dest, eM0_address_src, dxbc::Src::LU(~uint32_t(3))); + // Erase the bits that will be replaced with eM0 via sub_dword_temp.y. + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0010), width_src, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kXXXX), dxbc::Src::LU(0), + dxbc::Src::LU(UINT32_MAX)); + a_.OpAtomicAnd(atomic_dest, eM0_address_src, 0b0001, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kYYYY)); + // Add the eM0 bits via sub_dword_temp.y. + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0010), width_src, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kXXXX), + dxbc::Src::R(system_temps_memexport_data_[0], dxbc::Src::kXXXX), + dxbc::Src::LU(0)); + a_.OpAtomicOr(atomic_dest, eM0_address_src, 0b0001, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kYYYY)); + a_.OpEndIf(); + } + if (export_eM14) { + // sub_dword_temp = eM# offset in the dword (8 << (byte_address & 3)) + // (assuming a little-endian host). 
+ a_.OpBFI(dxbc::Dest::R(sub_dword_temp, export_eM14), dxbc::Src::LU(2), + dxbc::Src::LU(3), dxbc::Src::R(eM14_address_temp), + dxbc::Src::LU(0)); + // Keep only the dword part of the address. + a_.OpAnd(dxbc::Dest::R(eM14_address_temp, export_eM14), + dxbc::Src::R(eM14_address_temp), dxbc::Src::LU(~uint32_t(3))); + uint32_t sub_dword_data_temp = PushSystemTemp(); + eM14_remaining = export_eM14; + while (xe::bit_scan_forward(eM14_remaining, &eM14_index)) { + eM14_remaining &= ~(uint8_t(1) << eM14_index); + a_.OpIf(true, dxbc::Src::R(store_eM14_temp).Select(eM14_index)); + // Erase the bits that will be replaced with eM# via + // sub_dword_data_temp.x. + a_.OpBFI(dxbc::Dest::R(sub_dword_data_temp, 0b0001), width_src, + dxbc::Src::R(sub_dword_temp).Select(eM14_index), + dxbc::Src::LU(0), dxbc::Src::LU(UINT32_MAX)); + a_.OpAtomicAnd( + atomic_dest, dxbc::Src::R(eM14_address_temp).Select(eM14_index), + 0b0001, dxbc::Src::R(sub_dword_data_temp, dxbc::Src::kXXXX)); + // Add the eM# bits via sub_dword_temp.y. + a_.OpBFI(dxbc::Dest::R(sub_dword_data_temp, 0b0001), width_src, + dxbc::Src::R(sub_dword_temp).Select(eM14_index), + dxbc::Src::R(system_temps_memexport_data_[1 + eM14_index], + dxbc::Src::kXXXX), + dxbc::Src::LU(0)); + a_.OpAtomicOr( + atomic_dest, dxbc::Src::R(eM14_address_temp).Select(eM14_index), + 0b0001, dxbc::Src::R(sub_dword_data_temp, dxbc::Src::kXXXX)); + a_.OpEndIf(); + } + // Release sub_dword_data_temp. + PopSystemTemp(); + } + // Release sub_dword_temp. + PopSystemTemp(); + a_.OpBreak(); + } + + // 32bpp, 64bpp, 128bpp. + for (uint32_t i = 2; i <= 4; ++i) { + a_.OpCase(dxbc::Src::LU(i)); + // Store (0b0001), Store2 (0b0011), Store4 (0b1111). 
+ uint32_t store_mask = (uint32_t(1) << (uint32_t(1) << (i - 2))) - 1; + dxbc::Dest store_dest(dxbc::Dest::U(uav_index_shared_memory_, + uint32_t(UAVRegister::kSharedMemory), + store_mask)); + if (export_eM & 0b1) { + a_.OpIf(true, eM0_written_src); + a_.OpStoreRaw(store_dest, eM0_address_src, + dxbc::Src::R(system_temps_memexport_data_[0])); + a_.OpEndIf(); + } + eM14_remaining = export_eM14; + while (xe::bit_scan_forward(eM14_remaining, &eM14_index)) { + eM14_remaining &= ~(uint8_t(1) << eM14_index); + a_.OpIf(true, dxbc::Src::R(store_eM14_temp).Select(eM14_index)); + a_.OpStoreRaw(store_dest, + dxbc::Src::R(eM14_address_temp).Select(eM14_index), + dxbc::Src::R(system_temps_memexport_data_[1 + eM14_index])); + a_.OpEndIf(); + } + a_.OpBreak(); + } + + // Close the element size switch. + a_.OpEndSwitch(); + + if (export_eM14) { + // Release eM14_address_temp and store_eM14_temp. + PopSystemTemp(2); + } + + // Close the eM0 bounds check. a_.OpEndIf(); - // Release control_temp. + // Release temp. PopSystemTemp(); + + // Close the address correctness conditional. + a_.OpEndIf(); + + // Close the memory export allowed conditional. + a_.OpEndIf(); } } // namespace gpu diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index f07c0ee97..6ac4f568e 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -673,7 +673,7 @@ class Shader { // For implementation without unconditional support for memory writes from // vertex shaders, vertex shader converted to a compute shader doing only // memory export. - kMemexportCompute, + kMemExportCompute, // 4 host vertices for 1 guest vertex, for implementations without // unconditional geometry shader support. @@ -770,9 +770,16 @@ class Shader { } }; - // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game - // .pdb. 
- static constexpr uint32_t kMaxMemExports = 16; + struct ControlFlowMemExportInfo { + // Which eM elements have potentially (regardless of conditionals, loop + // iteration counts, predication) been written earlier in the predecessor + // graph of the instruction since an `alloc export`. + uint8_t eM_potentially_written_before = 0; + // For exec sequences, which eM elements are potentially (regardless of + // predication) written by the instructions in the sequence. For other + // control flow instructions, it's 0. + uint8_t eM_potentially_written_by_exec = 0; + }; class Translation { public: @@ -880,19 +887,21 @@ class Shader { return constant_register_map_; } - // uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have - // been written to after each `alloc export`, for up to Shader::kMaxMemExports - // exports. This will contain zero for certain corrupt exports - for those to - // which a valid eA was not written via a MAD with a stream constant. - const uint8_t* memexport_eM_written() const { return memexport_eM_written_; } + // Information about memory export state at each control flow instruction. May + // be empty if there are no eM# writes. + const std::vector& cf_memexport_info() const { + return cf_memexport_info_; + } - // All c# registers used as the addend in MAD operations to eA. + uint8_t memexport_eM_written() const { return memexport_eM_written_; } + uint8_t memexport_eM_potentially_written_before_end() const { + return memexport_eM_potentially_written_before_end_; + } + + // c# registers used as the addend in MAD operations to eA. const std::set& memexport_stream_constants() const { return memexport_stream_constants_; } - bool is_valid_memexport_used() const { - return !memexport_stream_constants_.empty(); - } // Labels that jumps (explicit or from loops) can be done to. 
const std::set& label_addresses() const { return label_addresses_; } @@ -970,7 +979,7 @@ class Shader { // TODO(Triang3l): Investigate what happens to memexport when the pixel // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early // depth/stencil. - return !kills_pixels() && !writes_depth() && !is_valid_memexport_used(); + return !kills_pixels() && !writes_depth() && !memexport_eM_written(); } // Whether each color render target is written to on any execution path. @@ -1042,8 +1051,6 @@ class Shader { std::vector vertex_bindings_; std::vector texture_bindings_; ConstantRegisterMap constant_register_map_ = {0}; - uint8_t memexport_eM_written_[kMaxMemExports] = {}; - std::set memexport_stream_constants_; std::set label_addresses_; uint32_t cf_pair_index_bound_ = 0; uint32_t register_static_address_bound_ = 0; @@ -1055,6 +1062,17 @@ class Shader { bool uses_texture_fetch_instruction_results_ = false; bool writes_depth_ = false; + // Memory export eM write info for each control flow instruction, if there are + // any eM writes in the shader. + std::vector cf_memexport_info_; + // Which memexport elements (eM#) are written for any memexport in the shader. + uint8_t memexport_eM_written_ = 0; + // ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the + // end of the shader, for the last memory export (or exports if the end has + // multiple predecessor chains exporting to memory). + uint8_t memexport_eM_potentially_written_before_end_ = 0; + std::set memexport_stream_constants_; + // Modification bits -> translation. 
std::unordered_map translations_; @@ -1064,8 +1082,7 @@ class Shader { void GatherExecInformation( const ParsedExecInstruction& instr, ucode::VertexFetchInstruction& previous_vfetch_full, - uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer); + uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer); void GatherVertexFetchInformation( const ucode::VertexFetchInstruction& op, ucode::VertexFetchInstruction& previous_vfetch_full, @@ -1074,13 +1091,12 @@ class Shader { uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer); void GatherAluInstructionInformation(const ucode::AluInstruction& op, - uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, + uint32_t exec_cf_index, StringBuffer& ucode_disasm_buffer); void GatherOperandInformation(const InstructionOperand& operand); void GatherFetchResultInformation(const InstructionResult& result); void GatherAluResultInformation(const InstructionResult& result, - uint32_t memexport_alloc_current_count); + uint32_t exec_cf_index); }; } // namespace gpu diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index dc38a42b8..10509f743 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/base/logging.h" @@ -93,8 +94,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { VertexFetchInstruction previous_vfetch_full; std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full)); uint32_t unique_texture_bindings = 0; - uint32_t memexport_alloc_count = 0; - uint32_t memexport_eA_written = 0; for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { ControlFlowInstruction cf_ab[2]; UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab); @@ -117,8 +116,7 @@ void Shader::AnalyzeUcode(StringBuffer& 
ucode_disasm_buffer) { ParsedExecInstruction instr; ParseControlFlowExec(cf.exec, cf_index, instr); GatherExecInformation(instr, previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kCondExec: case ControlFlowOpcode::kCondExecEnd: @@ -128,16 +126,14 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParsedExecInstruction instr; ParseControlFlowCondExec(cf.cond_exec, cf_index, instr); GatherExecInformation(instr, previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kCondExecPred: case ControlFlowOpcode::kCondExecPredEnd: { ParsedExecInstruction instr; ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr); GatherExecInformation(instr, previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kLoopStart: { ParsedLoopStartInstruction instr; @@ -179,9 +175,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParseControlFlowAlloc(cf.alloc, cf_index, type() == xenos::ShaderType::kVertex, instr); instr.Disassemble(&ucode_disasm_buffer); - if (instr.type == AllocType::kMemory) { - ++memexport_alloc_count; - } } break; case ControlFlowOpcode::kMarkVsFetchDone: break; @@ -212,17 +205,125 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { } } - // Cleanup invalid/unneeded memexport allocs. 
- for (uint32_t i = 0; i < kMaxMemExports; ++i) { - if (!(memexport_eA_written & (uint32_t(1) << i))) { - memexport_eM_written_[i] = 0; - } else if (!memexport_eM_written_[i]) { - memexport_eA_written &= ~(uint32_t(1) << i); + if (!cf_memexport_info_.empty()) { + // Gather potentially "dirty" memexport elements before each control flow + // instruction. `alloc` (any, not only `export`) flushes the previous memory + // export. On the guest GPU, yielding / serializing also terminates memory + // exports, but for simplicity disregarding that, as that functionally does + // nothing compared to flushing the previous memory export only at `alloc` + // or even only specifically at `alloc export`, Microsoft's validator checks + // if eM# aren't written after a `serialize`. + std::vector successor_stack; + for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { + ControlFlowInstruction eM_writing_cf_ab[2]; + UnpackControlFlowInstructions(ucode_data_.data() + i * 3, + eM_writing_cf_ab); + for (uint32_t j = 0; j < 2; ++j) { + uint32_t eM_writing_cf_index = i * 2 + j; + uint32_t eM_written_by_cf_instr = + cf_memexport_info_[eM_writing_cf_index] + .eM_potentially_written_by_exec; + if (eM_writing_cf_ab[j].opcode() == ControlFlowOpcode::kCondCall) { + // Until subroutine calls are handled accurately, assume that all eM# + // have potentially been written by the subroutine for simplicity. + eM_written_by_cf_instr = memexport_eM_written_; + } + if (!eM_written_by_cf_instr) { + continue; + } + + // If the control flow instruction potentially results in any eM# being + // written, mark those eM# as potentially written before each successor. 
+ bool is_successor_graph_head = true; + successor_stack.push_back(eM_writing_cf_index); + while (!successor_stack.empty()) { + uint32_t successor_cf_index = successor_stack.back(); + successor_stack.pop_back(); + + ControlFlowMemExportInfo& successor_memexport_info = + cf_memexport_info_[successor_cf_index]; + if ((successor_memexport_info.eM_potentially_written_before & + eM_written_by_cf_instr) == eM_written_by_cf_instr) { + // Already marked as written before this instruction (and thus + // before all its successors too). Possibly this instruction is in a + // loop, in this case an instruction may succeed itself. + break; + } + // The first instruction in the traversal is the writing instruction + // itself, not its successor. However, if it has been visited by the + // traversal twice, it's in a loop, so it succeeds itself, and thus + // writes from it are potentially done before it too. + if (!is_successor_graph_head) { + successor_memexport_info.eM_potentially_written_before |= + eM_written_by_cf_instr; + } + is_successor_graph_head = false; + + ControlFlowInstruction successor_cf_ab[2]; + UnpackControlFlowInstructions( + ucode_data_.data() + (successor_cf_index >> 1) * 3, + successor_cf_ab); + const ControlFlowInstruction& successor_cf = + successor_cf_ab[successor_cf_index & 1]; + + bool next_instr_is_new_successor = true; + switch (successor_cf.opcode()) { + case ControlFlowOpcode::kExecEnd: + // One successor: end. + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + next_instr_is_new_successor = false; + break; + case ControlFlowOpcode::kCondExecEnd: + case ControlFlowOpcode::kCondExecPredEnd: + case ControlFlowOpcode::kCondExecPredCleanEnd: + // Two successors: next, end. + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + break; + case ControlFlowOpcode::kLoopStart: + // Two successors: next, skip. 
+ successor_stack.push_back(successor_cf.loop_start.address()); + break; + case ControlFlowOpcode::kLoopEnd: + // Two successors: next, repeat. + successor_stack.push_back(successor_cf.loop_end.address()); + break; + case ControlFlowOpcode::kCondCall: + // Two successors: next, target. + successor_stack.push_back(successor_cf.cond_call.address()); + break; + case ControlFlowOpcode::kReturn: + // Currently treating all subroutine calls as potentially writing + // all eM# for simplicity, so just exit the subroutine. + next_instr_is_new_successor = false; + break; + case ControlFlowOpcode::kCondJmp: + // One or two successors: next if conditional, target. + successor_stack.push_back(successor_cf.cond_jmp.address()); + if (successor_cf.cond_jmp.is_unconditional()) { + next_instr_is_new_successor = false; + } + break; + case ControlFlowOpcode::kAlloc: + // Any `alloc` ends the previous export. + next_instr_is_new_successor = false; + break; + default: + break; + } + if (next_instr_is_new_successor) { + if (successor_cf_index < (cf_pair_index_bound_ << 1)) { + successor_stack.push_back(successor_cf_index + 1); + } else { + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + } + } + } + } } } - if (memexport_eA_written == 0) { - memexport_stream_constants_.clear(); - } is_ucode_analyzed_ = true; @@ -256,8 +357,7 @@ uint32_t Shader::GetInterpolatorInputMask(reg::SQ_PROGRAM_CNTL sq_program_cntl, void Shader::GatherExecInformation( const ParsedExecInstruction& instr, ucode::VertexFetchInstruction& previous_vfetch_full, - uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer) { instr.Disassemble(&ucode_disasm_buffer); uint32_t sequence = instr.sequence; for (uint32_t instr_offset = instr.instruction_address; @@ -279,8 +379,7 @@ void Shader::GatherExecInformation( } } else { auto& op = 
*reinterpret_cast(op_ptr); - GatherAluInstructionInformation(op, memexport_alloc_current_count, - memexport_eA_written, + GatherAluInstructionInformation(op, instr.dword_index, ucode_disasm_buffer); } } @@ -388,8 +487,8 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op, } void Shader::GatherAluInstructionInformation( - const AluInstruction& op, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + const AluInstruction& op, uint32_t exec_cf_index, + StringBuffer& ucode_disasm_buffer) { ParsedAluInstruction instr; ParseAluInstruction(op, type(), instr); instr.Disassemble(&ucode_disasm_buffer); @@ -401,10 +500,8 @@ void Shader::GatherAluInstructionInformation( (ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state & ucode::kAluOpChangedStatePixelKill); - GatherAluResultInformation(instr.vector_and_constant_result, - memexport_alloc_current_count); - GatherAluResultInformation(instr.scalar_result, - memexport_alloc_current_count); + GatherAluResultInformation(instr.vector_and_constant_result, exec_cf_index); + GatherAluResultInformation(instr.scalar_result, exec_cf_index); for (size_t i = 0; i < instr.vector_operand_count; ++i) { GatherOperandInformation(instr.vector_operands[i]); } @@ -412,9 +509,7 @@ void Shader::GatherAluInstructionInformation( GatherOperandInformation(instr.scalar_operands[i]); } - // Store used memexport constants because CPU code needs addresses and sizes, - // and also whether there have been writes to eA and eM# for register - // allocation in shader translator implementations. + // Store used memexport constants because CPU code needs addresses and sizes. 
// eA is (hopefully) always written to using: // mad eA, r#, const0100, c# // (though there are some exceptions, shaders in 4D5307E6 for some reason set @@ -423,13 +518,9 @@ void Shader::GatherAluInstructionInformation( // Export is done to vector_dest of the ucode instruction for both vector and // scalar operations - no need to check separately. if (instr.vector_and_constant_result.storage_target == - InstructionStorageTarget::kExportAddress && - memexport_alloc_current_count > 0 && - memexport_alloc_current_count <= Shader::kMaxMemExports) { + InstructionStorageTarget::kExportAddress) { uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant(); if (memexport_stream_constant != UINT32_MAX) { - memexport_eA_written |= uint32_t(1) - << (memexport_alloc_current_count - 1); memexport_stream_constants_.insert(memexport_stream_constant); } else { XELOGE( @@ -488,8 +579,8 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) { } } -void Shader::GatherAluResultInformation( - const InstructionResult& result, uint32_t memexport_alloc_current_count) { +void Shader::GatherAluResultInformation(const InstructionResult& result, + uint32_t exec_cf_index) { uint32_t used_write_mask = result.GetUsedWriteMask(); if (!used_write_mask) { return; @@ -511,11 +602,12 @@ void Shader::GatherAluResultInformation( writes_point_size_edge_flag_kill_vertex_ |= used_write_mask; break; case InstructionStorageTarget::kExportData: - if (memexport_alloc_current_count > 0 && - memexport_alloc_current_count <= Shader::kMaxMemExports) { - memexport_eM_written_[memexport_alloc_current_count - 1] |= - uint32_t(1) << result.storage_index; + memexport_eM_written_ |= uint8_t(1) << result.storage_index; + if (cf_memexport_info_.empty()) { + cf_memexport_info_.resize(2 * cf_pair_index_bound_); } + cf_memexport_info_[exec_cf_index].eM_potentially_written_by_exec |= + uint32_t(1) << result.storage_index; break; case InstructionStorageTarget::kColor: writes_color_targets_ 
|= uint32_t(1) << result.storage_index; @@ -672,7 +764,13 @@ void ShaderTranslator::TranslateControlFlowInstruction( case ControlFlowOpcode::kAlloc: { ParsedAllocInstruction instr; ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr); - ProcessAllocInstruction(instr); + const std::vector& cf_memexport_info = + current_shader().cf_memexport_info(); + ProcessAllocInstruction(instr, + instr.dword_index < cf_memexport_info.size() + ? cf_memexport_info[instr.dword_index] + .eM_potentially_written_before + : 0); } break; case ControlFlowOpcode::kMarkVsFetchDone: break; @@ -814,6 +912,14 @@ void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf, void ShaderTranslator::TranslateExecInstructions( const ParsedExecInstruction& instr) { ProcessExecInstructionBegin(instr); + + const std::vector& cf_memexport_info = + current_shader().cf_memexport_info(); + uint8_t eM_potentially_written_before = + instr.dword_index < cf_memexport_info.size() + ? cf_memexport_info[instr.dword_index].eM_potentially_written_before + : 0; + const uint32_t* ucode_dwords = current_shader().ucode_data().data(); uint32_t sequence = instr.sequence; for (uint32_t instr_offset = instr.instruction_address; @@ -839,9 +945,22 @@ void ShaderTranslator::TranslateExecInstructions( auto& op = *reinterpret_cast(op_ptr); ParsedAluInstruction alu_instr; ParseAluInstruction(op, current_shader().type(), alu_instr); - ProcessAluInstruction(alu_instr); + ProcessAluInstruction(alu_instr, eM_potentially_written_before); + if (alu_instr.vector_and_constant_result.storage_target == + InstructionStorageTarget::kExportData && + alu_instr.vector_and_constant_result.GetUsedWriteMask()) { + eM_potentially_written_before |= + uint8_t(1) << alu_instr.vector_and_constant_result.storage_index; + } + if (alu_instr.scalar_result.storage_target == + InstructionStorageTarget::kExportData && + alu_instr.scalar_result.GetUsedWriteMask()) { + eM_potentially_written_before |= + uint8_t(1) << 
alu_instr.scalar_result.storage_index; + } } } + ProcessExecInstructionEnd(instr); } diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 593907952..5cca60c33 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -127,8 +127,10 @@ class ShaderTranslator { virtual void ProcessReturnInstruction(const ParsedReturnInstruction& instr) {} // Handles translation for jump instructions. virtual void ProcessJumpInstruction(const ParsedJumpInstruction& instr) {} - // Handles translation for alloc instructions. - virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr) {} + // Handles translation for alloc instructions. Memory exports for eM# + // indicated by export_eM must be performed, regardless of the alloc type. + virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) {} // Handles translation for vertex fetch instructions. virtual void ProcessVertexFetchInstruction( @@ -137,7 +139,13 @@ class ShaderTranslator { virtual void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) {} // Handles translation for ALU instructions. - virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {} + // memexport_eM_potentially_written_before needs to be handled by `kill` + // instruction to make sure memory exports for the eM# writes earlier in + // previous execs and the current exec are done before the invocation becomes + // inactive. 
+ virtual void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) {} private: void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf); diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index b7da0678d..9889fb630 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -134,7 +134,7 @@ class SpirvShaderTranslator : public ShaderTranslator { // (32-bit only - 16-bit indices are always fetched via the Vulkan index // buffer). kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift, - // For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip, + // For HostVertexShaderTypes kMemExportCompute, kPointListAsTriangleStrip, // kRectangleListAsTriangleStrip, whether the vertex index needs to be // loaded from the index buffer (rather than using autogenerated indices), // and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad @@ -427,7 +427,9 @@ class SpirvShaderTranslator : public ShaderTranslator { const ParsedVertexFetchInstruction& instr) override; void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) override; - void ProcessAluInstruction(const ParsedAluInstruction& instr) override; + void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) override; private: struct TextureBinding { @@ -620,7 +622,7 @@ class SpirvShaderTranslator : public ShaderTranslator { assert_true(edram_fragment_shader_interlock_); return !is_depth_only_fragment_shader_ && !current_shader().writes_depth() && - !current_shader().is_valid_memexport_used(); + !current_shader().memexport_eM_written(); } void FSI_LoadSampleMask(spv::Id msaa_samples); void FSI_LoadEdramOffsets(spv::Id msaa_samples); diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc index 
47978dd00..05e41d5ab 100644 --- a/src/xenia/gpu/spirv_shader_translator_alu.cc +++ b/src/xenia/gpu/spirv_shader_translator_alu.cc @@ -67,7 +67,8 @@ void SpirvShaderTranslator::KillPixel(spv::Id condition) { } void SpirvShaderTranslator::ProcessAluInstruction( - const ParsedAluInstruction& instr) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) { if (instr.IsNop()) { // Don't even disassemble or update predication. return; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 3b0875e25..83719b7a9 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -215,7 +215,7 @@ enum class AllocType : uint32_t { kVsInterpolators = 2, // Pixel shader exports colors. kPsColors = 2, - // MEMEXPORT? + // Memory export. kMemory = 3, }; @@ -1787,6 +1787,9 @@ inline uint32_t GetAluVectorOpNeededSourceComponents( .operand_components_used[src_index - 1]; } +// eM# (kExportData) register count. +constexpr uint32_t kMaxMemExportElementCount = 5; + enum class ExportRegister : uint32_t { kVSInterpolator0 = 0, kVSInterpolator1, diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 8df4af859..7136505b2 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2175,7 +2175,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + bool memexport_used_vertex = vertex_shader->memexport_eM_written(); // Pixel shader analysis. 
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 55a16df1d..396056bfe 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -456,6 +456,18 @@ enum class TextureFormat : uint32_t { k_6_5_5 = 5, k_8_8_8_8 = 6, k_2_10_10_10 = 7, + // Possibly similar to k_8, but may be storing alpha instead of red when + // resolving/memexporting, though not exactly known. From the point of view of + // sampling, it should be treated the same as k_8 (given that textures have + // the last - and single-component textures have the only - component + // replicated into all the remaining ones before the swizzle). + // Used as: + // - Texture in 4B4E083C - text, starting from the "Loading..." and the "This + // game saves data automatically" messages. The swizzle in the fetch + // constant is 111W (suggesting that internally the only component may be + // the alpha one, not red). + // TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and + // memexports, whether they store alpha/blue of the input or red. k_8_A = 8, k_8_B = 9, k_8_8 = 10, @@ -469,6 +481,12 @@ enum class TextureFormat : uint32_t { // Used for videos in 54540829. k_Y1_Cr_Y0_Cb_REP = 12, k_16_16_EDRAM = 13, + // Likely same as k_8_8_8_8. + // Used as: + // - Memexport destination in 4D5308BC - multiple small draws when looking + // back at the door behind the player in the first room of gameplay. + // - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame + // between the intro video and the main menu, in a 8192-point draw. 
k_8_8_8_8_A = 14, k_4_4_4_4 = 15, k_10_11_11 = 16, @@ -1326,8 +1344,7 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also // interesting to see how alphatest interacts with it, whether it's still true // fixed-function alphatest, as it's claimed to be supported as usual by the -// extension specification - it's likely, however, that memory exports are -// discarded alongside other exports such as oC# and oDepth this way. +// extension specification. // // Y of eA contains the offset in elements - this is what shaders are supposed // to calculate from something like the vertex index. Again, it's specified as @@ -1350,6 +1367,69 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // elements using packing via addition to 2^23, so this field also doesn't need // more bits than that. // +// According to the sequencer specification from IPR2015-00325 (where memexport +// is called "pass thru export"): +// - Pass thru exports can occur anywhere in the shader program. +// - There can be any number of pass thru exports. +// - The address register is not kept across clause boundaries, so it must be +// refreshed after any Serialize (or yield), allocate instruction or resource +// change. +// - The write to eM# may be predicated if the export is not needed. +// - Exports are dropped if: +// - The index is above the maximum. +// - The index sign bit is 1. +// - The exponent of the index is not 23. +// The requirement that eM4 must be written if any eM# other than eM0 is also +// written doesn't apply to the final Xenos, it's likely an outdated note in the +// specification considering that it's very preliminary. +// +// According to Microsoft's shader validator: +// - eA can be written only by `mad`. +// - A single eM# can be written by any number of instruction, including with +// write masking. +// - eA must be written before eM#. 
+// - Any alloc instruction or a `serialize` terminates the current memory +// export. This doesn't apply to `exec Yield=true`, however, and it's not +// clear if that's an oversight or if that's not considered a yield that +// terminates the export. +// +// From the emulation perspective, this means that: +// - Alloc instructions (`alloc export` mandatorily, other allocs optionally), +// and optionally `serialize` instructions within `exec`, should be treated as +// the locations where the currently open export should be flushed to the +// memory. It should be taken into account that an export may be in looping +// control flow, and in this case it must be performed at every iteration. +// - Whether each eM# was written to must be tracked at shader execution time, +// as predication can disable the export of an element. +// +// TODO(Triang3l): Investigate how memory export interacts with pixel killing. +// Given that eM# writes disabled by predication don't cause an export, it's +// possible that killed invocations are treated as inactive (invalid in Xenos +// terms) overall, and thus new memory exports from them shouldn't be done, but +// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan +// hosts, discarding disables subsequent storage resource writes, on the host, +// it would be natural to perform all outstanding memory exports before +// discarding if the kill condition passes. +// +// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp +// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for +// instance, due to the minimum resource view size limitation on Direct3D 11). +// In this case, bytes and shorts aren't addressable directly. 
However, taking +// into account that memory accesses are coherent within one shader invocation +// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relatively to +// each other, it should be possible to implement them by clearing the bits via +// an atomic AND, and writing the new value using an atomic OR. This will, of +// course, make the entire write operation non-atomic, and in case of a race +// between writes to the same location, the final result may not even be just a +// value from one of the invocations, but rather, it can be OR of the values +// from any invocations involved. However, on the Xenos, there doesn't seem to +// be any possibility of meaningfully accessing the same location from multiple +// invocations if any of them is writing, memory exports are out-of-order, so +// such an implementation shouldn't be causing issues in reality. Atomic +// compare-exchange, however, should not be used for this purpose, as it may +// result in an infinite loop if different invocations want to write different +// values to the same memory location. 
+// // Examples of setup in titles (Z from MSB to LSB): // // 4D5307E6 particles (different VS invocation counts, like 1, 2, 4): @@ -1385,6 +1465,11 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // c0: Z = 010010110000|0|010|11|011010|00011|001 // 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch // (16_16_16_16 is the largest color format without special values) +// +// 58410B86 hierarchical depth buffer occlusion culling with the result read on +// the CPU (15000 VS invocations in the main menu): +// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count +// No endian swap, 8, uint, RGBA union alignas(uint32_t) xe_gpu_memexport_stream_t { struct { uint32_t dword_0; From 0e81293b02457f43702157c5b6de054644ab0797 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 5 May 2023 21:27:42 +0300 Subject: [PATCH 02/14] [GPU] Remove a dangerous comment about break after exece [ci skip] There can be jumps across an exece, so the code beyond it may still be executed. --- src/xenia/gpu/shader_translator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 10509f743..a9561d1cf 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -186,7 +186,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { constant_register_map_.bool_bitmap[bool_constant_index / 32] |= uint32_t(1) << (bool_constant_index % 32); } - // TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))? 
} } ucode_disassembly_ = ucode_disasm_buffer.to_string(); From ed64e3072bde74955c79b9f3b0030c133648d702 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 5 May 2023 21:38:45 +0300 Subject: [PATCH 03/14] [GPU] Remove implicit bool cast in memexport checks --- src/xenia/gpu/d3d12/d3d12_command_processor.cc | 4 ++-- src/xenia/gpu/vulkan/vulkan_command_processor.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 90427f5f7..814a74a7c 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2125,7 +2125,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->memexport_eM_written(); + bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; // Pixel shader analysis. 
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); @@ -2154,7 +2154,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } } bool memexport_used_pixel = - pixel_shader && pixel_shader->memexport_eM_written(); + pixel_shader && (pixel_shader->memexport_eM_written() != 0); bool memexport_used = memexport_used_vertex || memexport_used_pixel; if (!BeginSubmission(true)) { diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 7136505b2..7115929f4 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2175,7 +2175,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->memexport_eM_written(); + bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; // Pixel shader analysis. bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); From 93b77fb775c46dc56c8612cb9f4ac3fdc7cc973b Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Sun, 5 Feb 2023 17:00:43 -0800 Subject: [PATCH 04/14] [PPC] Implement `vaddcuw` I don't know of any title that utilizes this instruction, but I went ahead and implemented it for completeness. Verified the implementation with `instr__gen_vaddcuw` from #1348. 
Can be grabbed with: ``` git checkout origin/gen_tests -- src\xenia\cpu\ppc\testing\*vaddcuw.s ``` --- src/xenia/cpu/ppc/ppc_emit_altivec.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 3ca5bc40f..b8257c1bc 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -358,8 +358,13 @@ int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + Value* sum = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE, + ARITHMETIC_UNSIGNED); + Value* overflow = f.VectorCompareUGT(f.LoadVR(i.VX.VA), sum, INT32_TYPE); + Value* carry = + f.VectorShr(overflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE); + f.StoreVR(i.VX.VD, carry); + return 0; } int InstrEmit_vaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) { From 121bf93cbe2c23fe0a47f1f43e3f050df46e976d Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Mon, 17 Apr 2023 10:33:37 -0700 Subject: [PATCH 05/14] [PPC] Implement `vsubcuw` Other half of #2125. I don't know of any title that utilizes this instruction, but I went ahead and implemented it for completeness. Verified the implementation with `instr__gen_vsubcuw` from #1348. 
Can be grabbed with: ``` git checkout origin/gen_tests -- src\xenia\cpu\ppc\testing\*vsubcuw.s ``` --- src/xenia/cpu/ppc/ppc_emit_altivec.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index b8257c1bc..437b3f09b 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -1662,7 +1662,11 @@ int InstrEmit_vsrw128(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vsubcuw(PPCHIRBuilder& f, const InstrData& i) { - XEINSTRNOTIMPLEMENTED(); + Value* underflow = + f.VectorCompareUGE(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE); + Value* borrow = + f.VectorShr(underflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE); + f.StoreVR(i.VX.VD, borrow); return 1; } From 6ee2e3718f81748f3f2c6cae7287dffa7f87b14e Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Sun, 5 Feb 2023 17:55:09 -0800 Subject: [PATCH 06/14] [x64] Add AVX512 optimizations for `OPCODE_VECTOR_COMPARE_UGT`(Integer) AVX512 has native unsigned integer comparisons instructions, removing the need to XOR the most-significant-bit with a constant in memory to use the signed comparison instructions. These instructions only write to a k-mask register though and need an additional call to `vpmovm2*` to turn the mask-register into a vector-mask register. 
As of Icelake: `vpcmpu*` is all L3/T1 `vpmovm2d` is L1/T0.33 `vpmovm2{b,w}` is L3/T0.33 As of Zen4: `vpcmpu*` is all L3/T0.50 `vpmovm2*` is all L1/T0.25 --- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 37 +++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 75f162559..205adf2a4 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -409,6 +409,43 @@ struct VECTOR_COMPARE_UGT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW | + kX64EmitAVX512DQ) && + (i.instr->flags != FLOAT32_TYPE)) { + Xmm src1 = e.xmm0; + if (i.src1.is_constant) { + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + Xmm src2 = e.xmm1; + if (i.src2.is_constant) { + e.LoadConstantXmm(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpub(e.k1, src1, src2, 0x6); + e.vpmovm2b(i.dest, e.k1); + break; + case INT16_TYPE: + e.vpcmpuw(e.k1, src1, src2, 0x6); + e.vpmovm2w(i.dest, e.k1); + break; + case INT32_TYPE: + e.vpcmpud(e.k1, src1, src2, 0x6); + e.vpmovm2d(i.dest, e.k1); + break; + default: + assert_always(); + break; + } + return; + } + Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy switch (i.instr->flags) { case INT8_TYPE: From e110527bfea4722d59da5a7603ea46b826a6740b Mon Sep 17 00:00:00 2001 From: Gliniak Date: Wed, 31 May 2023 07:54:23 +0200 Subject: [PATCH 07/14] [Base] ListFiles: Prevent leakage of file descriptors --- src/xenia/base/filesystem_posix.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/base/filesystem_posix.cc b/src/xenia/base/filesystem_posix.cc index 2e9ddb2c5..2fb75c5bf 100644 --- a/src/xenia/base/filesystem_posix.cc +++ b/src/xenia/base/filesystem_posix.cc @@ -234,7 +234,7 @@ std::vector ListFiles(const 
std::filesystem::path& path) { } result.push_back(info); } - + closedir(dir); return result; } From 858af5ae756027f60f8219052bdf0e1af1979aa8 Mon Sep 17 00:00:00 2001 From: Gliniak Date: Mon, 20 Feb 2023 13:33:50 +0100 Subject: [PATCH 08/14] [XAM] xeXamContentCreate - Disposition cleanup --- src/xenia/kernel/xam/xam_content.cc | 38 ++++++++++++----------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index 11a9cf52b..0fe3cc5bb 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -116,6 +116,8 @@ dword_result_t XamContentCreateEnumerator_entry( } DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented); +enum class kDispositionState : uint32_t { Unknown = 0, Create = 1, Open = 2 }; + dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, lpvoid_t content_data_ptr, dword_t content_data_size, dword_t flags, @@ -143,40 +145,37 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, content_data, disposition_ptr, license_mask_ptr]( uint32_t& extended_error, uint32_t& length) -> X_RESULT { X_RESULT result = X_ERROR_INVALID_PARAMETER; - bool create = false; - bool open = false; + kDispositionState disposition = kDispositionState::Unknown; switch (flags & 0xF) { case 1: // CREATE_NEW // Fail if exists. if (content_manager->ContentExists(content_data)) { result = X_ERROR_ALREADY_EXISTS; } else { - create = true; + disposition = kDispositionState::Create; } break; case 2: // CREATE_ALWAYS // Overwrite existing, if any. if (content_manager->ContentExists(content_data)) { content_manager->DeleteContent(content_data); - create = true; - } else { - create = true; } + disposition = kDispositionState::Create; break; case 3: // OPEN_EXISTING // Open only if exists. 
if (!content_manager->ContentExists(content_data)) { result = X_ERROR_PATH_NOT_FOUND; } else { - open = true; + disposition = kDispositionState::Open; } break; case 4: // OPEN_ALWAYS // Create if needed. if (!content_manager->ContentExists(content_data)) { - create = true; + disposition = kDispositionState::Create; } else { - open = true; + disposition = kDispositionState::Open; } break; case 5: // TRUNCATE_EXISTING @@ -185,7 +184,7 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, result = X_ERROR_PATH_NOT_FOUND; } else { content_manager->DeleteContent(content_data); - create = true; + disposition = kDispositionState::Create; } break; default: @@ -193,19 +192,14 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, break; } - // creation result - // 0 = ? - // 1 = created - // 2 = opened - uint32_t disposition = create ? 1 : 2; - if (disposition_ptr) { - *disposition_ptr = disposition; + if (disposition == kDispositionState::Create) { + result = content_manager->CreateContent(root_name, content_data); + } else if (disposition == kDispositionState::Open) { + result = content_manager->OpenContent(root_name, content_data); } - if (create) { - result = content_manager->CreateContent(root_name, content_data); - } else if (open) { - result = content_manager->OpenContent(root_name, content_data); + if (disposition_ptr) { + *disposition_ptr = static_cast(disposition); } if (license_mask_ptr && XSUCCEEDED(result)) { @@ -213,7 +207,7 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, } extended_error = X_HRESULT_FROM_WIN32(result); - length = disposition; + length = static_cast(disposition); return result; }; From 4a3b04d4ee90f31f08e7aa7ab30228c3eebe1d85 Mon Sep 17 00:00:00 2001 From: Adrian <78108584+AdrianCassar@users.noreply.github.com> Date: Sun, 12 Feb 2023 21:42:11 +0000 Subject: [PATCH 09/14] [XAM] Implemented XamGetCurrentTitleId --- src/xenia/kernel/xam/xam_info.cc | 5 +++++ 1 file 
changed, 5 insertions(+) diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 0bfe7c97b..772600dc7 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -222,6 +222,11 @@ dword_result_t XGetLanguage_entry() { } DECLARE_XAM_EXPORT1(XGetLanguage, kNone, kImplemented); +dword_result_t XamGetCurrentTitleId_entry() { + return kernel_state()->emulator()->title_id(); +} +DECLARE_XAM_EXPORT1(XamGetCurrentTitleId, kNone, kImplemented); + dword_result_t XamGetExecutionId_entry(lpdword_t info_ptr) { auto module = kernel_state()->GetExecutableModule(); assert_not_null(module); From 41c423109f9d40550c689952779a988538b4fca6 Mon Sep 17 00:00:00 2001 From: Roy Stewart Date: Fri, 3 Feb 2023 02:40:42 -0500 Subject: [PATCH 10/14] [Base] Set the path for posix file info --- src/xenia/base/filesystem_posix.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/base/filesystem_posix.cc b/src/xenia/base/filesystem_posix.cc index 2fb75c5bf..85a727f1e 100644 --- a/src/xenia/base/filesystem_posix.cc +++ b/src/xenia/base/filesystem_posix.cc @@ -225,6 +225,7 @@ std::vector ListFiles(const std::filesystem::path& path) { info.create_timestamp = convertUnixtimeToWinFiletime(st.st_ctime); info.access_timestamp = convertUnixtimeToWinFiletime(st.st_atime); info.write_timestamp = convertUnixtimeToWinFiletime(st.st_mtime); + info.path = path; if (ent->d_type == DT_DIR) { info.type = FileInfo::Type::kDirectory; info.total_size = 0; From 07e81fe1727df1516d613bbaba209c35ba5f9f00 Mon Sep 17 00:00:00 2001 From: Roy Stewart Date: Fri, 3 Feb 2023 02:43:51 -0500 Subject: [PATCH 11/14] [Base] Filter out relative directories on linux --- src/xenia/base/filesystem_posix.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/xenia/base/filesystem_posix.cc b/src/xenia/base/filesystem_posix.cc index 85a727f1e..193e637ea 100644 --- a/src/xenia/base/filesystem_posix.cc +++ b/src/xenia/base/filesystem_posix.cc @@ -217,6 +217,10 @@ 
std::vector ListFiles(const std::filesystem::path& path) { } while (auto ent = readdir(dir)) { + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + continue; + } + FileInfo info; info.name = ent->d_name; From 00aba94b98b1369d604f233fcd6a9dbbc233f3bb Mon Sep 17 00:00:00 2001 From: Gliniak Date: Thu, 5 Jan 2023 21:02:55 +0100 Subject: [PATCH 12/14] [NET] NetDll___WSAFDIsSet: Fixed incorrect endianness of fd_count Plus: limit it to 64 entries Thanks to Bo98 for pointing that out --- src/xenia/kernel/xam/xam_net.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/xenia/kernel/xam/xam_net.cc b/src/xenia/kernel/xam/xam_net.cc index a331f139d..c208198cd 100644 --- a/src/xenia/kernel/xam/xam_net.cc +++ b/src/xenia/kernel/xam/xam_net.cc @@ -1017,7 +1017,9 @@ DECLARE_XAM_EXPORT1(NetDll_sendto, kNetworking, kImplemented); dword_result_t NetDll___WSAFDIsSet_entry(dword_t socket_handle, pointer_t fd_set) { - for (uint32_t i = 0; i < fd_set->fd_count.value; i++) { + const uint8_t max_fd_count = + std::min((uint32_t)fd_set->fd_count, uint32_t(64)); + for (uint8_t i = 0; i < max_fd_count; i++) { if (fd_set->fd_array[i] == socket_handle) { return 1; } From 1887ea0795ab050ad8f2e2ef3e1918b0f342d3b4 Mon Sep 17 00:00:00 2001 From: Adriano Martins Date: Thu, 29 Jun 2023 19:45:35 -0300 Subject: [PATCH 13/14] [Base] Add missing #include to utf8.cc --- src/xenia/base/utf8.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/base/utf8.cc b/src/xenia/base/utf8.cc index 6405aa2f8..eb7ec85b1 100644 --- a/src/xenia/base/utf8.cc +++ b/src/xenia/base/utf8.cc @@ -10,6 +10,7 @@ #include "xenia/base/utf8.h" #include +#include #include #include #include From c5e6352c349ca65b7119bab08d19797e95eb1509 Mon Sep 17 00:00:00 2001 From: Gliniak Date: Mon, 7 Mar 2022 08:17:51 +0100 Subject: [PATCH 14/14] [CPU] Added constant propagation pass for: OPCODE_AND_NOT --- src/xenia/cpu/backend/x64/x64_sequences.cc | 36 ++++++++----------- 
.../passes/constant_propagation_pass.cc | 9 +++++ 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index d8da70122..dabd65b2b 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2697,34 +2697,28 @@ EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128); template void EmitAndNotXX(X64Emitter& e, const ARGS& i) { if (i.src1.is_constant) { - if (i.src2.is_constant) { - // Both constants. - e.mov(i.dest, i.src1.constant() & ~i.src2.constant()); - } else { - // src1 constant. + // src1 constant. + // `and` instruction only supports up to 32-bit immediate constants + // 64-bit constants will need a temp register + if (i.dest.reg().getBit() == 64) { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); - // `and` instruction only supports up to 32-bit immediate constants - // 64-bit constants will need a temp register - if (i.dest.reg().getBit() == 64) { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); - - if (e.IsFeatureEnabled(kX64EmitBMI1)) { - if (i.dest.reg().getBit() == 64) { - e.andn(i.dest.reg().cvt64(), i.src2.reg().cvt64(), temp.cvt64()); - } else { - e.andn(i.dest.reg().cvt32(), i.src2.reg().cvt32(), temp.cvt32()); - } + if (e.IsFeatureEnabled(kX64EmitBMI1)) { + if (i.dest.reg().getBit() == 64) { + e.andn(i.dest.reg().cvt64(), i.src2.reg().cvt64(), temp.cvt64()); } else { - e.mov(i.dest, i.src2); - e.not_(i.dest); - e.and_(i.dest, temp); + e.andn(i.dest.reg().cvt32(), i.src2.reg().cvt32(), temp.cvt32()); } } else { e.mov(i.dest, i.src2); e.not_(i.dest); - e.and_(i.dest, uint32_t(i.src1.constant())); + e.and_(i.dest, temp); } + } else { + e.mov(i.dest, i.src2); + e.not_(i.dest); + e.and_(i.dest, uint32_t(i.src1.constant())); } } else if (i.src2.is_constant) { // src2 constant. 
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index b6b0376fa..77c3f21b6 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -648,6 +648,15 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; + case OPCODE_AND_NOT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src2.value); + v->Not(); + v->And(i->src1.value); + i->Remove(); + result = true; + } + break; case OPCODE_OR: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_from(i->src1.value);