diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index f0e59fb5f..2bac528bd 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -302,6 +302,10 @@ class RenderTargetCache { } return xenos::IsColorRenderTargetFormat64bpp(GetColorFormat()); } + const char* GetFormatName() const { + return is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat()) + : xenos::GetColorRenderTargetFormatName(GetColorFormat()); + } uint32_t GetPitchTiles() const { return pitch_tiles_at_32bpp << uint32_t(Is64bpp()); @@ -317,11 +321,9 @@ class RenderTargetCache { } std::string GetDebugName() const { - return fmt::format( - "RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles, GetPitchTiles(), - uint32_t(1) << uint32_t(msaa_samples), - is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat()) - : xenos::GetColorRenderTargetFormatName(GetColorFormat())); + return fmt::format("RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles, + GetPitchTiles(), uint32_t(1) << uint32_t(msaa_samples), + GetFormatName()); } }; diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index ce940da49..bcd140445 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -113,11 +113,9 @@ uint32_t SpirvShaderTranslator::GetModificationRegisterCount() const { } void SpirvShaderTranslator::StartTranslation() { - // Tool ID 26 "Xenia Emulator Microcode Translator". - // https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79 // TODO(Triang3l): Logger. - builder_ = std::make_unique(features_.spirv_version, - (26 << 16) | 1, nullptr); + builder_ = std::make_unique( + features_.spirv_version, (kSpirvMagicToolId << 16) | 1, nullptr); builder_->addCapability(IsSpirvTessEvalShader() ? spv::CapabilityTessellation : spv::CapabilityShader); @@ -1535,20 +1533,20 @@ spv::Id SpirvShaderTranslator::GetUnmodifiedOperandComponents( static_cast(original_operand.GetComponent(scalar_index)) - static_cast(SwizzleSource::kX)); } - id_vector_temp_util_.clear(); - id_vector_temp_util_.reserve(component_count); + uint_vector_temp_util_.clear(); + uint_vector_temp_util_.reserve(component_count); uint32_t components_remaining = components; uint32_t component_index; while (xe::bit_scan_forward(components_remaining, &component_index)) { components_remaining &= ~(uint32_t(1) << component_index); - id_vector_temp_util_.push_back( + uint_vector_temp_util_.push_back( static_cast( original_operand.GetComponent(component_index)) - static_cast(SwizzleSource::kX)); } return builder_->createRvalueSwizzle(spv::NoPrecision, type_float_vectors_[component_count - 1], - operand_storage, id_vector_temp_util_); + operand_storage, uint_vector_temp_util_); } void SpirvShaderTranslator::GetOperandScalarXY( diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 932bd608f..beb478bb6 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -138,6 +138,10 @@ class SpirvShaderTranslator : public ShaderTranslator { kDescriptorSetCount, }; + // "Xenia Emulator Microcode Translator". 
+ // https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79 + static constexpr uint32_t kSpirvMagicToolId = 26; + struct Features { explicit Features(const ui::vulkan::VulkanProvider& provider); explicit Features(bool all = false); @@ -172,6 +176,38 @@ class SpirvShaderTranslator : public ShaderTranslator { features_.max_storage_buffer_range); } + // Common functions useful not only for the translator, but also for EDRAM + // emulation via conventional render targets. + + // Converts the color value externally clamped to [0, 31.875] to 7e3 floating + // point, with zeros in bits 10:31, rounding to the nearest even. + static spv::Id PreClampedFloat32To7e3(spv::Builder& builder, + spv::Id f32_scalar, + spv::Id ext_inst_glsl_std_450); + // Same as PreClampedFloat32To7e3, but clamps the input to [0, 31.875]. + static spv::Id UnclampedFloat32To7e3(spv::Builder& builder, + spv::Id f32_scalar, + spv::Id ext_inst_glsl_std_450); + // Converts the 7e3 number in bits [f10_shift, f10_shift + 10) to a 32-bit + // float. + static spv::Id Float7e3To32(spv::Builder& builder, spv::Id f10_uint_scalar, + uint32_t f10_shift, bool result_as_uint, + spv::Id ext_inst_glsl_std_450); + // Converts the depth value externally clamped to the representable [0, 2) + // range to 20e4 floating point, with zeros in bits 24:31, rounding to the + // nearest even. If remap_from_0_to_0_5 is true, it's assumed that 0...1 is + // pre-remapped to 0...0.5 in the input. + static spv::Id PreClampedDepthTo20e4(spv::Builder& builder, + spv::Id f32_scalar, + bool remap_from_0_to_0_5, + spv::Id ext_inst_glsl_std_450); + // Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit + // float. + static spv::Id Depth20e4To32(spv::Builder& builder, spv::Id f24_uint_scalar, + uint32_t f24_shift, bool remap_to_0_to_0_5, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450); + protected: void Reset() override; diff --git a/src/xenia/gpu/spirv_shader_translator_rb.cc b/src/xenia/gpu/spirv_shader_translator_rb.cc new file mode 100644 index 000000000..4cb260bdd --- /dev/null +++ b/src/xenia/gpu/spirv_shader_translator_rb.cc @@ -0,0 +1,425 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv_shader_translator.h" + +#include +#include + +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "xenia/base/assert.h" + +namespace xe { +namespace gpu { + +spv::Id SpirvShaderTranslator::PreClampedFloat32To7e3( + spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) { + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Assuming the value is already clamped to [0, 31.875]. + + spv::Id type_uint = builder.makeUintType(32); + + // Need the source as uint for bit operations. + { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isUintType(source_type)) { + f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar); + } + } + + // The denormal 7e3 case. 
+ // denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + spv::Id denormal_biased_f32; + { + spv::Instruction* denormal_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + denormal_insert_instruction->addIdOperand(f32_scalar); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_insert_instruction)); + denormal_biased_f32 = denormal_insert_instruction->getResultId(); + } + // denormal_biased_f32_shift_amount = min(125 - (f32 >> 23), 24) + // Not allowing the shift to overflow as that's undefined in SPIR-V. + spv::Id denormal_biased_f32_shift_amount; + { + spv::Instruction* denormal_shift_amount_instruction = + new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst); + denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin); + denormal_shift_amount_instruction->addIdOperand(builder.createBinOp( + spv::OpISub, type_uint, builder.makeUintConstant(125), + builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar, + builder.makeUintConstant(23)))); + denormal_shift_amount_instruction->addIdOperand( + builder.makeUintConstant(24)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_shift_amount_instruction)); + denormal_biased_f32_shift_amount = + denormal_shift_amount_instruction->getResultId(); + } + // denormal_biased_f32 = + // ((f32 & 0x7FFFFF) | 0x800000) >> min(125 - (f32 >> 23), 24) + denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint, + denormal_biased_f32, + denormal_biased_f32_shift_amount); + + // The normal 7e3 case. + // Bias the exponent. + // normal_biased_f32 = f32 - (124 << 23) + spv::Id normal_biased_f32 = + builder.createBinOp(spv::OpISub, type_uint, f32_scalar, + builder.makeUintConstant(UINT32_C(124) << 23)); + + // Select the needed conversion depending on whether the number is too small + // to be represented as normalized 7e3. + spv::Id biased_f32 = builder.createTriOp( + spv::OpSelect, type_uint, + builder.createBinOp(spv::OpULessThan, builder.makeBoolType(), f32_scalar, + builder.makeUintConstant(0x3E800000)), + denormal_biased_f32, normal_biased_f32); + + // Build the 7e3 number rounding to the nearest even. + // ((biased_f32 + 0x7FFF + ((biased_f32 >> 16) & 1)) >> 16) & 0x3FF + return builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIAdd, type_uint, biased_f32, + builder.makeUintConstant(0x7FFF)), + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32, + builder.makeUintConstant(16), + builder.makeUintConstant(1))), + builder.makeUintConstant(16), builder.makeUintConstant(10)); +} + +spv::Id SpirvShaderTranslator::UnclampedFloat32To7e3( + spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) { + spv::Id type_float = builder.makeFloatType(32); + + // Need the source as float for clamping. 
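// Reference sketch, not part of this patch: a scalar CPU equivalent of the
// float32 -> 7e3 packing that PreClampedFloat32To7e3 above emits, assuming the
// input is already clamped to [0, 31.875]. The constants mirror the SPIR-V
// above; the helper name is hypothetical. Uses <cstring> for std::memcpy and
// <algorithm> for std::min.
uint32_t Float32To7e3Reference(float value) {
  uint32_t f32;
  std::memcpy(&f32, &value, sizeof(f32));
  uint32_t biased_f32;
  if (f32 < 0x3E800000u) {
    // Too small for a normalized 7e3 number - make the implicit 1 explicit and
    // shift the mantissa right, clamping the shift amount to avoid undefined
    // behavior.
    uint32_t shift = std::min(125u - (f32 >> 23), 24u);
    biased_f32 = ((f32 & 0x7FFFFFu) | 0x800000u) >> shift;
  } else {
    // Rebias the exponent from 127 (float32) to 3 (7e3).
    biased_f32 = f32 - (124u << 23);
  }
  // Round to the nearest even and keep 3 exponent + 7 mantissa bits.
  return ((biased_f32 + 0x7FFF + ((biased_f32 >> 16) & 1)) >> 16) & 0x3FF;
}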
+ { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isFloatType(source_type)) { + f32_scalar = + builder.createUnaryOp(spv::OpBitcast, type_float, f32_scalar); + } + } + + { + spv::Instruction* clamp_instruction = + new spv::Instruction(builder.getUniqueId(), type_float, spv::OpExtInst); + clamp_instruction->addIdOperand(ext_inst_glsl_std_450); + clamp_instruction->addImmediateOperand(GLSLstd450NClamp); + clamp_instruction->addIdOperand(f32_scalar); + clamp_instruction->addIdOperand(builder.makeFloatConstant(0.0f)); + clamp_instruction->addIdOperand(builder.makeFloatConstant(31.875f)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(clamp_instruction)); + f32_scalar = clamp_instruction->getResultId(); + } + + return PreClampedFloat32To7e3(builder, f32_scalar, ext_inst_glsl_std_450); +} + +spv::Id SpirvShaderTranslator::Float7e3To32(spv::Builder& builder, + spv::Id f10_uint_scalar, + uint32_t f10_shift, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450) { + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + + assert_true(builder.isUintType(builder.getTypeId(f10_uint_scalar))); + assert_true(f10_shift <= (32 - 10)); + + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_uint = builder.makeUintType(32); + + spv::Id f10_unbiased_exponent = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f10_uint_scalar, + builder.makeUintConstant(f10_shift + 7), builder.makeUintConstant(3)); + spv::Id f10_mantissa = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f10_uint_scalar, + builder.makeUintConstant(f10_shift), builder.makeUintConstant(7)); + + // The denormal nonzero 7e3 case. + // denormal_mantissa_msb = findMSB(f10_mantissa) + spv::Id denormal_mantissa_msb; + { + spv::Instruction* denormal_mantissa_msb_instruction = + new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst); + denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb); + denormal_mantissa_msb_instruction->addIdOperand(f10_mantissa); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_mantissa_msb_instruction)); + denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId(); + } + denormal_mantissa_msb = + builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb); + // denormal_f32_unbiased_exponent = 1 - (7 - findMSB(f10_mantissa)) + // Or: + // denormal_f32_unbiased_exponent = findMSB(f10_mantissa) - 6 + spv::Id denormal_f32_unbiased_exponent = + builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb, + builder.makeUintConstant(6)); + // Normalize the mantissa. + // denormal_f32_mantissa = f10_mantissa << (7 - findMSB(f10_mantissa)) + spv::Id denormal_f32_mantissa = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, f10_mantissa, + builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(7), + denormal_mantissa_msb)); + // If the 7e3 number is zero, make sure the float32 number is zero too. + spv::Id f10_mantissa_is_nonzero = builder.createBinOp( + spv::OpINotEqual, type_bool, f10_mantissa, builder.makeUintConstant(0)); + // Set the unbiased exponent to -124 for zero - 124 will be added later, + // resulting in zero float32. 
+ denormal_f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f10_mantissa_is_nonzero, + denormal_f32_unbiased_exponent, builder.makeUintConstant(uint32_t(-124))); + denormal_f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f10_mantissa_is_nonzero, + denormal_f32_mantissa, builder.makeUintConstant(0)); + + // Select the needed conversion depending on whether the number is normal. + spv::Id f10_is_normal = + builder.createBinOp(spv::OpINotEqual, type_bool, f10_unbiased_exponent, + builder.makeUintConstant(0)); + spv::Id f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f10_is_normal, f10_unbiased_exponent, + denormal_f32_unbiased_exponent); + spv::Id f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f10_is_normal, f10_mantissa, + denormal_f32_mantissa); + + // Bias the exponent and construct the build the float32 number. + spv::Id f32_shifted; + { + spv::Instruction* f32_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + f32_insert_instruction->addIdOperand(f32_mantissa); + f32_insert_instruction->addIdOperand( + builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent, + builder.makeUintConstant(124))); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(7)); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(8)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(f32_insert_instruction)); + f32_shifted = f32_insert_instruction->getResultId(); + } + spv::Id f32 = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted, + builder.makeUintConstant(23 - 7)); + + if (!result_as_uint) { + f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32); + } + + return f32; +} + +spv::Id SpirvShaderTranslator::PreClampedDepthTo20e4( + spv::Builder& builder, spv::Id f32_scalar, bool remap_from_0_to_0_5, + spv::Id ext_inst_glsl_std_450) { + // CFloat24 from d3dref9.dll + + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Assuming the value is already clamped to [0, 2) (in all places, the depth + // is written with saturation). + + uint32_t remap_bias = uint32_t(remap_from_0_to_0_5); + + spv::Id type_uint = builder.makeUintType(32); + + // Need the source as uint for bit operations. + { + spv::Id source_type = builder.getTypeId(f32_scalar); + assert_true(builder.isScalarType(source_type)); + if (!builder.isUintType(source_type)) { + f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar); + } + } + + // The denormal 20e4 case. + // denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + spv::Id denormal_biased_f32; + { + spv::Instruction* denormal_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + denormal_insert_instruction->addIdOperand(f32_scalar); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23)); + denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_insert_instruction)); + denormal_biased_f32 = denormal_insert_instruction->getResultId(); + } + // denormal_biased_f32_shift_amount = min(113 - (f32 >> 23), 24) + // Not allowing the shift to overflow as that's undefined in SPIR-V. 
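// Reference sketch, not part of this patch: a scalar CPU equivalent of the
// 7e3 -> float32 conversion emitted by Float7e3To32 above, for a 10-bit value
// held in the low bits of f10. The helper name is hypothetical; uses <cstring>
// for std::memcpy.
float Float7e3To32Reference(uint32_t f10) {
  uint32_t exponent = (f10 >> 7) & 0x7;
  uint32_t mantissa = f10 & 0x7F;
  if (exponent == 0) {
    if (mantissa == 0) {
      return 0.0f;
    }
    // Denormal - normalize the mantissa, adjusting the exponent accordingly
    // (the exponent may wrap below zero as unsigned here; adding 124 below
    // brings it back to the correct float32 exponent field).
    exponent = 1;
    do {
      --exponent;
      mantissa <<= 1;
    } while (!(mantissa & 0x80));
    mantissa &= 0x7F;
  }
  // Rebias the exponent from 3 (7e3) to 127 (float32) and place the mantissa.
  uint32_t f32 = ((exponent + 124) << 23) | (mantissa << 16);
  float result;
  std::memcpy(&result, &f32, sizeof(result));
  return result;
}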
+ spv::Id denormal_biased_f32_shift_amount; + { + spv::Instruction* denormal_shift_amount_instruction = + new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst); + denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin); + denormal_shift_amount_instruction->addIdOperand(builder.createBinOp( + spv::OpISub, type_uint, builder.makeUintConstant(113 - remap_bias), + builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar, + builder.makeUintConstant(23)))); + denormal_shift_amount_instruction->addIdOperand( + builder.makeUintConstant(24)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_shift_amount_instruction)); + denormal_biased_f32_shift_amount = + denormal_shift_amount_instruction->getResultId(); + } + // denormal_biased_f32 = + // ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) + denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint, + denormal_biased_f32, + denormal_biased_f32_shift_amount); + + // The normal 20e4 case. + // Bias the exponent. + // normal_biased_f32 = f32 - (112 << 23) + spv::Id normal_biased_f32 = builder.createBinOp( + spv::OpISub, type_uint, f32_scalar, + builder.makeUintConstant((UINT32_C(112) + remap_bias) << 23)); + + // Select the needed conversion depending on whether the number is too small + // to be represented as normalized 20e4. + spv::Id biased_f32 = builder.createTriOp( + spv::OpSelect, type_uint, + builder.createBinOp( + spv::OpULessThan, builder.makeBoolType(), f32_scalar, + builder.makeUintConstant(0x38800000 - (remap_bias << 23))), + denormal_biased_f32, normal_biased_f32); + + // Build the 20e4 number rounding to the nearest even. + // ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF + return builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp(spv::OpIAdd, type_uint, biased_f32, + builder.makeUintConstant(3)), + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32, + builder.makeUintConstant(3), + builder.makeUintConstant(1))), + builder.makeUintConstant(3), builder.makeUintConstant(24)); +} + +spv::Id SpirvShaderTranslator::Depth20e4To32(spv::Builder& builder, + spv::Id f24_uint_scalar, + uint32_t f24_shift, + bool remap_to_0_to_0_5, + bool result_as_uint, + spv::Id ext_inst_glsl_std_450) { + // CFloat24 from d3dref9.dll + + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + + assert_true(builder.isUintType(builder.getTypeId(f24_uint_scalar))); + assert_true(f24_shift <= (32 - 24)); + + uint32_t remap_bias = uint32_t(remap_to_0_to_0_5); + + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_uint = builder.makeUintType(32); + + spv::Id f24_unbiased_exponent = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f24_uint_scalar, + builder.makeUintConstant(f24_shift + 20), builder.makeUintConstant(4)); + spv::Id f24_mantissa = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, f24_uint_scalar, + builder.makeUintConstant(f24_shift), builder.makeUintConstant(20)); + + // The denormal nonzero 20e4 case. 
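// Reference sketch, not part of this patch: a scalar CPU equivalent of the
// float32 -> 20e4 depth packing emitted by PreClampedDepthTo20e4 above,
// assuming the input is already clamped to [0, 2). The helper name is
// hypothetical; uses <cstring> for std::memcpy and <algorithm> for std::min.
uint32_t DepthFloat32To20e4Reference(float value, bool remap_from_0_to_0_5) {
  uint32_t remap_bias = uint32_t(remap_from_0_to_0_5);
  uint32_t f32;
  std::memcpy(&f32, &value, sizeof(f32));
  uint32_t biased_f32;
  if (f32 < 0x38800000u - (remap_bias << 23)) {
    // Too small for a normalized 20e4 number - make the implicit 1 explicit
    // and shift the mantissa right, clamping the shift amount.
    uint32_t shift = std::min(113u - remap_bias - (f32 >> 23), 24u);
    biased_f32 = ((f32 & 0x7FFFFFu) | 0x800000u) >> shift;
  } else {
    // Rebias the exponent from 127 (float32) to 15 (20e4).
    biased_f32 = f32 - ((112u + remap_bias) << 23);
  }
  // Round to the nearest even and keep 4 exponent + 20 mantissa bits.
  return ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF;
}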
+ // denormal_mantissa_msb = findMSB(f24_mantissa) + spv::Id denormal_mantissa_msb; + { + spv::Instruction* denormal_mantissa_msb_instruction = + new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst); + denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450); + denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb); + denormal_mantissa_msb_instruction->addIdOperand(f24_mantissa); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(denormal_mantissa_msb_instruction)); + denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId(); + } + denormal_mantissa_msb = + builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb); + // denormal_f32_unbiased_exponent = 1 - (20 - findMSB(f24_mantissa)) + // Or: + // denormal_f32_unbiased_exponent = findMSB(f24_mantissa) - 19 + spv::Id denormal_f32_unbiased_exponent = + builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb, + builder.makeUintConstant(19)); + // Normalize the mantissa. + // denormal_f32_mantissa = f24_mantissa << (20 - findMSB(f24_mantissa)) + spv::Id denormal_f32_mantissa = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, f24_mantissa, + builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(20), + denormal_mantissa_msb)); + // If the 20e4 number is zero, make sure the float32 number is zero too. + spv::Id f24_mantissa_is_nonzero = builder.createBinOp( + spv::OpINotEqual, type_bool, f24_mantissa, builder.makeUintConstant(0)); + // Set the unbiased exponent to -112 for zero - 112 will be added later, + // resulting in zero float32. + denormal_f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f24_mantissa_is_nonzero, + denormal_f32_unbiased_exponent, + builder.makeUintConstant(uint32_t(-int32_t(112 - remap_bias)))); + denormal_f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f24_mantissa_is_nonzero, + denormal_f32_mantissa, builder.makeUintConstant(0)); + + // Select the needed conversion depending on whether the number is normal. + spv::Id f24_is_normal = + builder.createBinOp(spv::OpINotEqual, type_bool, f24_unbiased_exponent, + builder.makeUintConstant(0)); + spv::Id f32_unbiased_exponent = builder.createTriOp( + spv::OpSelect, type_uint, f24_is_normal, f24_unbiased_exponent, + denormal_f32_unbiased_exponent); + spv::Id f32_mantissa = + builder.createTriOp(spv::OpSelect, type_uint, f24_is_normal, f24_mantissa, + denormal_f32_mantissa); + + // Bias the exponent and construct the build the float32 number. 
+ spv::Id f32_shifted; + { + spv::Instruction* f32_insert_instruction = new spv::Instruction( + builder.getUniqueId(), type_uint, spv::OpBitFieldInsert); + f32_insert_instruction->addIdOperand(f32_mantissa); + f32_insert_instruction->addIdOperand( + builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent, + builder.makeUintConstant(112 - remap_bias))); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(20)); + f32_insert_instruction->addIdOperand(builder.makeUintConstant(8)); + builder.getBuildPoint()->addInstruction( + std::unique_ptr(f32_insert_instruction)); + f32_shifted = f32_insert_instruction->getResultId(); + } + spv::Id f32 = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted, + builder.makeUintConstant(23 - 20)); + + if (!result_as_uint) { + f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32); + } + + return f32; +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/vulkan/deferred_command_buffer.cc b/src/xenia/gpu/vulkan/deferred_command_buffer.cc index 470d8adde..98d42865d 100644 --- a/src/xenia/gpu/vulkan/deferred_command_buffer.cc +++ b/src/xenia/gpu/vulkan/deferred_command_buffer.cc @@ -103,6 +103,37 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { args.pipeline); } break; + case Command::kVkBindVertexBuffers: { + auto& args = *reinterpret_cast(stream); + size_t offset_bytes = + xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer)); + const VkBuffer* buffers = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + offset_bytes = + xe::align(offset_bytes + sizeof(VkBuffer) * args.binding_count, + alignof(VkDeviceSize)); + const VkDeviceSize* offsets = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + dfn.vkCmdBindVertexBuffers(command_buffer, args.first_binding, + args.binding_count, buffers, offsets); + } break; + + case Command::kVkClearAttachments: { + auto& args = *reinterpret_cast(stream); + size_t offset_bytes = xe::align(sizeof(ArgsVkClearAttachments), + alignof(VkClearAttachment)); + const VkClearAttachment* attachments = + reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + offset_bytes = xe::align( + offset_bytes + sizeof(VkClearAttachment) * args.attachment_count, + alignof(VkClearRect)); + const VkClearRect* rects = reinterpret_cast( + reinterpret_cast(stream) + offset_bytes); + dfn.vkCmdClearAttachments(command_buffer, args.attachment_count, + attachments, args.rect_count, rects); + } break; + case Command::kVkCopyBuffer: { auto& args = *reinterpret_cast(stream); dfn.vkCmdCopyBuffer( @@ -112,6 +143,12 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { xe::align(sizeof(ArgsVkCopyBuffer), alignof(VkBufferCopy)))); } break; + case Command::kVkDispatch: { + auto& args = *reinterpret_cast(stream); + dfn.vkCmdDispatch(command_buffer, args.group_count_x, + args.group_count_y, args.group_count_z); + } break; + case Command::kVkDraw: { auto& args = *reinterpret_cast(stream); dfn.vkCmdDraw(command_buffer, args.vertex_count, args.instance_count, @@ -168,6 +205,14 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) { args.image_memory_barrier_count, image_memory_barriers); } break; + case Command::kVkPushConstants: { + auto& args = *reinterpret_cast(stream); + dfn.vkCmdPushConstants(command_buffer, args.layout, args.stage_flags, + args.offset, args.size, + reinterpret_cast(stream) + + sizeof(ArgsVkPushConstants)); + } break; + case Command::kVkSetBlendConstants: { auto& args = 
*reinterpret_cast(stream); dfn.vkCmdSetBlendConstants(command_buffer, args.blend_constants); diff --git a/src/xenia/gpu/vulkan/deferred_command_buffer.h b/src/xenia/gpu/vulkan/deferred_command_buffer.h index ac4c88f85..e3605f1e6 100644 --- a/src/xenia/gpu/vulkan/deferred_command_buffer.h +++ b/src/xenia/gpu/vulkan/deferred_command_buffer.h @@ -108,6 +108,61 @@ class DeferredCommandBuffer { args.pipeline = pipeline; } + void CmdVkBindVertexBuffers(uint32_t first_binding, uint32_t binding_count, + const VkBuffer* buffers, + const VkDeviceSize* offsets) { + size_t arguments_size = + xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer)); + size_t buffers_offset = arguments_size; + arguments_size = + xe::align(arguments_size + sizeof(VkBuffer) * binding_count, + alignof(VkDeviceSize)); + size_t offsets_offset = arguments_size; + arguments_size += sizeof(VkDeviceSize) * binding_count; + uint8_t* args_ptr = reinterpret_cast( + WriteCommand(Command::kVkBindVertexBuffers, arguments_size)); + auto& args = *reinterpret_cast(args_ptr); + args.first_binding = first_binding; + args.binding_count = binding_count; + std::memcpy(args_ptr + buffers_offset, buffers, + sizeof(VkBuffer) * binding_count); + std::memcpy(args_ptr + offsets_offset, offsets, + sizeof(VkDeviceSize) * binding_count); + } + + void CmdClearAttachmentsEmplace(uint32_t attachment_count, + VkClearAttachment*& attachments_out, + uint32_t rect_count, + VkClearRect*& rects_out) { + size_t arguments_size = + xe::align(sizeof(ArgsVkClearAttachments), alignof(VkClearAttachment)); + size_t attachments_offset = arguments_size; + arguments_size = + xe::align(arguments_size + sizeof(VkClearAttachment) * attachment_count, + alignof(VkClearRect)); + size_t rects_offset = arguments_size; + arguments_size += sizeof(VkClearRect) * rect_count; + uint8_t* args_ptr = reinterpret_cast( + WriteCommand(Command::kVkClearAttachments, arguments_size)); + auto& args = *reinterpret_cast(args_ptr); + args.attachment_count = attachment_count; + args.rect_count = rect_count; + attachments_out = + reinterpret_cast(args_ptr + attachments_offset); + rects_out = reinterpret_cast(args_ptr + rects_offset); + } + void CmdVkClearAttachments(uint32_t attachment_count, + const VkClearAttachment* attachments, + uint32_t rect_count, const VkClearRect* rects) { + VkClearAttachment* attachments_arg; + VkClearRect* rects_arg; + CmdClearAttachmentsEmplace(attachment_count, attachments_arg, rect_count, + rects_arg); + std::memcpy(attachments_arg, attachments, + sizeof(VkClearAttachment) * attachment_count); + std::memcpy(rects_arg, rects, sizeof(VkClearRect) * rect_count); + } + VkBufferCopy* CmdCopyBufferEmplace(VkBuffer src_buffer, VkBuffer dst_buffer, uint32_t region_count) { const size_t header_size = @@ -127,6 +182,15 @@ class DeferredCommandBuffer { regions, sizeof(VkBufferCopy) * region_count); } + void CmdVkDispatch(uint32_t group_count_x, uint32_t group_count_y, + uint32_t group_count_z) { + auto& args = *reinterpret_cast( + WriteCommand(Command::kVkDispatch, sizeof(ArgsVkDispatch))); + args.group_count_x = group_count_x; + args.group_count_y = group_count_y; + args.group_count_z = group_count_z; + } + void CmdVkDraw(uint32_t vertex_count, uint32_t instance_count, uint32_t first_vertex, uint32_t first_instance) { auto& args = *reinterpret_cast( @@ -162,6 +226,19 @@ class DeferredCommandBuffer { uint32_t image_memory_barrier_count, const VkImageMemoryBarrier* image_memory_barriers); + void CmdVkPushConstants(VkPipelineLayout layout, + VkShaderStageFlags stage_flags, 
uint32_t offset, + uint32_t size, const void* values) { + uint8_t* args_ptr = reinterpret_cast(WriteCommand( + Command::kVkPushConstants, sizeof(ArgsVkPushConstants) + size)); + auto& args = *reinterpret_cast(args_ptr); + args.layout = layout; + args.stage_flags = stage_flags; + args.offset = offset; + args.size = size; + std::memcpy(args_ptr + sizeof(ArgsVkPushConstants), values, size); + } + void CmdVkSetBlendConstants(const float* blend_constants) { auto& args = *reinterpret_cast(WriteCommand( Command::kVkSetBlendConstants, sizeof(ArgsVkSetBlendConstants))); @@ -237,11 +314,15 @@ class DeferredCommandBuffer { kVkBindDescriptorSets, kVkBindIndexBuffer, kVkBindPipeline, + kVkBindVertexBuffers, + kVkClearAttachments, kVkCopyBuffer, + kVkDispatch, kVkDraw, kVkDrawIndexed, kVkEndRenderPass, kVkPipelineBarrier, + kVkPushConstants, kVkSetBlendConstants, kVkSetDepthBias, kVkSetScissor, @@ -289,6 +370,22 @@ class DeferredCommandBuffer { VkPipeline pipeline; }; + struct ArgsVkBindVertexBuffers { + uint32_t first_binding; + uint32_t binding_count; + // Followed by aligned VkBuffer[], VkDeviceSize[]. + static_assert(alignof(VkBuffer) <= alignof(uintmax_t)); + static_assert(alignof(VkDeviceSize) <= alignof(uintmax_t)); + }; + + struct ArgsVkClearAttachments { + uint32_t attachment_count; + uint32_t rect_count; + // Followed by aligned VkClearAttachment[], VkClearRect[]. + static_assert(alignof(VkClearAttachment) <= alignof(uintmax_t)); + static_assert(alignof(VkClearRect) <= alignof(uintmax_t)); + }; + struct ArgsVkCopyBuffer { VkBuffer src_buffer; VkBuffer dst_buffer; @@ -297,6 +394,12 @@ class DeferredCommandBuffer { static_assert(alignof(VkBufferCopy) <= alignof(uintmax_t)); }; + struct ArgsVkDispatch { + uint32_t group_count_x; + uint32_t group_count_y; + uint32_t group_count_z; + }; + struct ArgsVkDraw { uint32_t vertex_count; uint32_t instance_count; @@ -326,6 +429,14 @@ class DeferredCommandBuffer { static_assert(alignof(VkImageMemoryBarrier) <= alignof(uintmax_t)); }; + struct ArgsVkPushConstants { + VkPipelineLayout layout; + VkShaderStageFlags stage_flags; + uint32_t offset; + uint32_t size; + // Followed by `size` bytes of values. 
+ }; + struct ArgsVkSetBlendConstants { float blend_constants[4]; }; diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index 44205f326..ffc359504 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -8,6 +8,7 @@ project("xenia-gpu-vulkan") language("C++") links({ "fmt", + "glslang-spirv", "xenia-base", "xenia-gpu", "xenia-ui", diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 69d0c70a3..4f534c9dd 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -476,7 +476,7 @@ bool VulkanCommandProcessor::SetupContext() { swap_pipeline_create_info.renderPass = swap_render_pass_; swap_pipeline_create_info.subpass = 0; swap_pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; - swap_pipeline_create_info.basePipelineIndex = UINT32_MAX; + swap_pipeline_create_info.basePipelineIndex = -1; VkResult swap_pipeline_create_result = dfn.vkCreateGraphicsPipelines( device, VK_NULL_HANDLE, 1, &swap_pipeline_create_info, nullptr, &swap_pipeline_); @@ -810,8 +810,6 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, deferred_command_buffer_.CmdVkBeginRenderPass( &render_pass_begin_info, VK_SUBPASS_CONTENTS_INLINE); - dynamic_viewport_update_needed_ = true; - dynamic_scissor_update_needed_ = true; VkViewport viewport; viewport.x = 0.0f; viewport.y = 0.0f; @@ -819,13 +817,13 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, viewport.height = float(scaled_height); viewport.minDepth = 0.0f; viewport.maxDepth = 1.0f; - deferred_command_buffer_.CmdVkSetViewport(0, 1, &viewport); - VkRect2D scissor_rect; - scissor_rect.offset.x = 0; - scissor_rect.offset.y = 0; - scissor_rect.extent.width = scaled_width; - scissor_rect.extent.height = scaled_height; - deferred_command_buffer_.CmdVkSetScissor(0, 1, &scissor_rect); + SetViewport(viewport); + VkRect2D scissor; + scissor.offset.x = 0; + scissor.offset.y = 0; + scissor.extent.width = scaled_width; + scissor.extent.height = scaled_height; + SetScissor(scissor); BindExternalGraphicsPipeline(swap_pipeline_); @@ -856,7 +854,7 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, EndSubmission(true); } -void VulkanCommandProcessor::PushBufferMemoryBarrier( +bool VulkanCommandProcessor::PushBufferMemoryBarrier( VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -865,7 +863,7 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( if (skip_if_equal && src_stage_mask == dst_stage_mask && src_access_mask == dst_access_mask && src_queue_family_index == dst_queue_family_index) { - return; + return false; } // Separate different barriers for overlapping buffer ranges into different @@ -889,10 +887,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( src_queue_family_index && other_buffer_memory_barrier.dstQueueFamilyIndex == dst_queue_family_index) { - // The barrier is already present. + // The barrier is already pending. 
current_pending_barrier_.src_stage_mask |= src_stage_mask; current_pending_barrier_.dst_stage_mask |= dst_stage_mask; - return; + return true; } SplitPendingBarrier(); break; @@ -911,9 +909,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier( buffer_memory_barrier.buffer = buffer; buffer_memory_barrier.offset = offset; buffer_memory_barrier.size = size; + return true; } -void VulkanCommandProcessor::PushImageMemoryBarrier( +bool VulkanCommandProcessor::PushImageMemoryBarrier( VkImage image, const VkImageSubresourceRange& subresource_range, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -923,7 +922,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( if (skip_if_equal && src_stage_mask == dst_stage_mask && src_access_mask == dst_access_mask && old_layout == new_layout && src_queue_family_index == dst_queue_family_index) { - return; + return false; } // Separate different barriers for overlapping image subresource ranges into @@ -969,10 +968,10 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( src_queue_family_index && other_image_memory_barrier.dstQueueFamilyIndex == dst_queue_family_index) { - // The barrier is already present. + // The barrier is already pending. current_pending_barrier_.src_stage_mask |= src_stage_mask; current_pending_barrier_.dst_stage_mask |= dst_stage_mask; - return; + return true; } SplitPendingBarrier(); break; @@ -992,6 +991,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier( image_memory_barrier.dstQueueFamilyIndex = dst_queue_family_index; image_memory_barrier.image = image; image_memory_barrier.subresourceRange = subresource_range; + return true; } bool VulkanCommandProcessor::SubmitBarriers(bool force_end_render_pass) { @@ -1257,6 +1257,53 @@ void VulkanCommandProcessor::BindExternalGraphicsPipeline( current_guest_graphics_pipeline_layout_ = VK_NULL_HANDLE; } +void VulkanCommandProcessor::BindExternalComputePipeline(VkPipeline pipeline) { + if (current_external_compute_pipeline_ == pipeline) { + return; + } + deferred_command_buffer_.CmdVkBindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); + current_external_compute_pipeline_ = pipeline; +} + +void VulkanCommandProcessor::SetViewport(const VkViewport& viewport) { + if (!dynamic_viewport_update_needed_) { + dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x; + dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.width != viewport.width; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.height != viewport.height; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.minDepth != viewport.minDepth; + dynamic_viewport_update_needed_ |= + dynamic_viewport_.maxDepth != viewport.maxDepth; + } + if (dynamic_viewport_update_needed_) { + dynamic_viewport_ = viewport; + deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_); + dynamic_viewport_update_needed_ = false; + } +} + +void VulkanCommandProcessor::SetScissor(const VkRect2D& scissor) { + if (!dynamic_scissor_update_needed_) { + dynamic_scissor_update_needed_ |= + dynamic_scissor_.offset.x != scissor.offset.x; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.offset.y != scissor.offset.y; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.extent.width != scissor.extent.width; + dynamic_scissor_update_needed_ |= + dynamic_scissor_.extent.height != scissor.extent.height; + } + if (dynamic_scissor_update_needed_) { + 
dynamic_scissor_ = scissor; + deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_); + dynamic_scissor_update_needed_ = false; + } +} + Shader* VulkanCommandProcessor::LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, @@ -1417,8 +1464,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, } const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); - const VkPhysicalDeviceProperties& device_properties = - provider.device_properties(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; // Get dynamic rasterizer state. draw_util::ViewportInfo viewport_info; @@ -1438,10 +1485,10 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // life. Or even disregard the viewport bounds range in the fragment shader // interlocks case completely - apply the viewport and the scissor offset // directly to pixel address and to things like ps_param_gen. - draw_util::GetHostViewportInfo( - regs, 1, 1, false, device_properties.limits.maxViewportDimensions[0], - device_properties.limits.maxViewportDimensions[1], true, false, false, - false, viewport_info); + draw_util::GetHostViewportInfo(regs, 1, 1, false, + device_limits.maxViewportDimensions[0], + device_limits.maxViewportDimensions[1], true, + false, false, false, viewport_info); // Update dynamic graphics pipeline state. UpdateDynamicState(viewport_info, primitive_polygonal); @@ -1675,6 +1722,8 @@ void VulkanCommandProcessor::CheckSubmissionFenceAndDeviceLoss( primitive_processor_->CompletedSubmissionUpdated(); + render_target_cache_->CompletedSubmissionUpdated(); + // Destroy outdated swap objects. while (!swap_framebuffers_outdated_.empty()) { const auto& framebuffer_pair = swap_framebuffers_outdated_.front(); @@ -1752,6 +1801,7 @@ bool VulkanCommandProcessor::BeginSubmission(bool is_guest_command) { current_framebuffer_ = nullptr; current_guest_graphics_pipeline_ = VK_NULL_HANDLE; current_external_graphics_pipeline_ = VK_NULL_HANDLE; + current_external_compute_pipeline_ = VK_NULL_HANDLE; current_guest_graphics_pipeline_layout_ = nullptr; current_graphics_descriptor_sets_bound_up_to_date_ = 0; @@ -1861,6 +1911,8 @@ bool VulkanCommandProcessor::EndSubmission(bool is_swap) { if (submission_open_) { EndRenderPass(); + render_target_cache_->EndSubmission(); + primitive_processor_->EndSubmission(); shared_memory_->EndSubmission(); @@ -2112,20 +2164,7 @@ void VulkanCommandProcessor::UpdateDynamicState( } viewport.minDepth = viewport_info.z_min; viewport.maxDepth = viewport_info.z_max; - dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x; - dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y; - dynamic_viewport_update_needed_ |= dynamic_viewport_.width != viewport.width; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.height != viewport.height; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.minDepth != viewport.minDepth; - dynamic_viewport_update_needed_ |= - dynamic_viewport_.maxDepth != viewport.maxDepth; - if (dynamic_viewport_update_needed_) { - dynamic_viewport_ = viewport; - deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_); - dynamic_viewport_update_needed_ = false; - } + SetViewport(viewport); // Scissor. 
draw_util::Scissor scissor; @@ -2135,19 +2174,7 @@ void VulkanCommandProcessor::UpdateDynamicState( scissor_rect.offset.y = int32_t(scissor.offset[1]); scissor_rect.extent.width = scissor.extent[0]; scissor_rect.extent.height = scissor.extent[1]; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.offset.x != scissor_rect.offset.x; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.offset.y != scissor_rect.offset.y; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.extent.width != scissor_rect.extent.width; - dynamic_scissor_update_needed_ |= - dynamic_scissor_.extent.height != scissor_rect.extent.height; - if (dynamic_scissor_update_needed_) { - dynamic_scissor_ = scissor_rect; - deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_); - dynamic_scissor_update_needed_ = false; - } + SetScissor(scissor_rect); // Depth bias. // TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 551a3fcae..54c25d22f 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -81,15 +81,16 @@ class VulkanCommandProcessor : public CommandProcessor { uint64_t GetCurrentFrame() const { return frame_current_; } uint64_t GetCompletedFrame() const { return frame_completed_; } - // Submission must be open to insert barriers. - void PushBufferMemoryBarrier( + // Submission must be open to insert barriers. Returning true if the barrier + // has actually been inserted and not dropped. + bool PushBufferMemoryBarrier( VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, uint32_t src_queue_family_index = VK_QUEUE_FAMILY_IGNORED, uint32_t dst_queue_family_index = VK_QUEUE_FAMILY_IGNORED, bool skip_if_equal = true); - void PushImageMemoryBarrier( + bool PushImageMemoryBarrier( VkImage image, const VkImageSubresourceRange& subresource_range, VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask, @@ -125,6 +126,9 @@ class VulkanCommandProcessor : public CommandProcessor { bool keep_dynamic_depth_bias = false, bool keep_dynamic_blend_constants = false, bool keep_dynamic_stencil_mask_ref = false); + void BindExternalComputePipeline(VkPipeline pipeline); + void SetViewport(const VkViewport& viewport); + void SetScissor(const VkRect2D& scissor); protected: bool SetupContext() override; @@ -211,6 +215,9 @@ class VulkanCommandProcessor : public CommandProcessor { // open non-frame submission, BeginSubmission(true) will promote it to a // frame. EndSubmission(true) will close the frame no matter whether the // submission has already been closed. + // Unlike on Direct3D 12, submission boundaries do not imply any memory + // barriers aside from an incoming host write (but not outgoing host read) + // dependency. 
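// Illustrative sketch, not code from this patch: the new boolean result of
// PushBufferMemoryBarrier / PushImageMemoryBarrier distinguishes "a barrier is
// now pending (newly added or merged into the pending batch)" from "the
// request was fully redundant and dropped", which callers can feed into their
// own state tracking. The function and variable names below are placeholders.
bool RequestEdramComputeToFragmentBarrier(
    VulkanCommandProcessor& command_processor, VkBuffer edram_buffer) {
  // Returns true if a barrier is now pending and will be recorded by the next
  // SubmitBarriers, false if the request was dropped as a no-op.
  return command_processor.PushBufferMemoryBarrier(
      edram_buffer, 0, VK_WHOLE_SIZE, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
      VK_ACCESS_SHADER_READ_BIT);
}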
// Rechecks submission number and reclaims per-submission resources. Pass 0 as // the submission to await to simply check status, or pass @@ -396,6 +403,7 @@ class VulkanCommandProcessor : public CommandProcessor { // TODO(Triang3l): Change to a deferred compilation handle. VkPipeline current_guest_graphics_pipeline_; VkPipeline current_external_graphics_pipeline_; + VkPipeline current_external_compute_pipeline_; // Pipeline layout of the current guest graphics pipeline. const PipelineLayout* current_guest_graphics_pipeline_layout_; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index 8f581f0fa..450a346b0 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -884,11 +884,25 @@ bool VulkanPipelineCache::EnsurePipelineCreated( // TODO(Triang3l): Wide lines. rasterization_state.lineWidth = 1.0f; + VkSampleMask sample_mask = UINT32_MAX; VkPipelineMultisampleStateCreateInfo multisample_state = {}; multisample_state.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisample_state.rasterizationSamples = VkSampleCountFlagBits( - uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples)); + if (description.render_pass_key.msaa_samples == xenos::MsaaSamples::k2X && + !render_target_cache_.IsMsaa2xSupported( + description.render_pass_key.depth_and_color_used != 0)) { + // Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same + // sample locations, but still top-left and bottom-right - however, this can + // be adjusted with custom sample locations). + multisample_state.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT; + sample_mask = 0b1001; + // TODO(Triang3l): Research sample mask behavior without attachments (in + // Direct3D, it's completely ignored in this case). 
+ multisample_state.pSampleMask = &sample_mask; + } else { + multisample_state.rasterizationSamples = VkSampleCountFlagBits( + uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples)); + } VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {}; depth_stencil_state.sType = @@ -1061,7 +1075,7 @@ bool VulkanPipelineCache::EnsurePipelineCreated( pipeline_create_info.renderPass = creation_arguments.render_pass; pipeline_create_info.subpass = 0; pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; - pipeline_create_info.basePipelineIndex = UINT32_MAX; + pipeline_create_info.basePipelineIndex = -1; const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index 24eb8e14b..b029f64dd 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -10,23 +10,109 @@ #include "xenia/gpu/vulkan/vulkan_render_target_cache.h" #include +#include #include #include #include +#include #include #include +#include +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "third_party/glslang/SPIRV/SpvBuilder.h" #include "xenia/base/assert.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" +#include "xenia/gpu/draw_util.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/spirv_shader_translator.h" +#include "xenia/gpu/vulkan/deferred_command_buffer.h" #include "xenia/gpu/vulkan/vulkan_command_processor.h" +#include "xenia/gpu/xenos.h" #include "xenia/ui/vulkan/vulkan_util.h" namespace xe { namespace gpu { namespace vulkan { +// Generated with `xb buildshaders`. +namespace shaders { +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_1xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_2xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/host_depth_store_4xmsaa_cs.h" +#include "xenia/gpu/shaders/bytecode/vulkan_spirv/passthrough_position_xy_vs.h" +} // namespace shaders + +const VulkanRenderTargetCache::TransferPipelineLayoutInfo + VulkanRenderTargetCache::kTransferPipelineLayoutInfos[size_t( + TransferPipelineLayoutIndex::kCount)] = { + // kColor + {kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordAddressBit}, + // kDepth + {kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordAddressBit}, + // kColorToStencilBit + {kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordAddressBit | + kTransferUsedPushConstantDwordStencilMaskBit}, + // kDepthToStencilBit + {kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordAddressBit | + kTransferUsedPushConstantDwordStencilMaskBit}, + // kColorAndHostDepthTexture + {kTransferUsedDescriptorSetHostDepthStencilTexturesBit | + kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // kColorAndHostDepthBuffer + {kTransferUsedDescriptorSetHostDepthBufferBit | + kTransferUsedDescriptorSetColorTextureBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // kDepthAndHostDepthTexture + {kTransferUsedDescriptorSetHostDepthStencilTexturesBit | + kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, + // 
kDepthAndHostDepthBuffer + {kTransferUsedDescriptorSetHostDepthBufferBit | + kTransferUsedDescriptorSetDepthStencilTexturesBit, + kTransferUsedPushConstantDwordHostDepthAddressBit | + kTransferUsedPushConstantDwordAddressBit}, +}; + +const VulkanRenderTargetCache::TransferModeInfo + VulkanRenderTargetCache::kTransferModes[size_t(TransferMode::kCount)] = { + // kColorToDepth + {TransferOutput::kDepth, TransferPipelineLayoutIndex::kColor}, + // kColorToColor + {TransferOutput::kColor, TransferPipelineLayoutIndex::kColor}, + // kDepthToDepth + {TransferOutput::kDepth, TransferPipelineLayoutIndex::kDepth}, + // kDepthToColor + {TransferOutput::kColor, TransferPipelineLayoutIndex::kDepth}, + // kColorToStencilBit + {TransferOutput::kStencilBit, + TransferPipelineLayoutIndex::kColorToStencilBit}, + // kDepthToStencilBit + {TransferOutput::kStencilBit, + TransferPipelineLayoutIndex::kDepthToStencilBit}, + // kColorAndHostDepthToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kColorAndHostDepthTexture}, + // kDepthAndHostDepthToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kDepthAndHostDepthTexture}, + // kColorAndHostDepthCopyToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kColorAndHostDepthBuffer}, + // kDepthAndHostDepthCopyToDepth + {TransferOutput::kDepth, + TransferPipelineLayoutIndex::kDepthAndHostDepthBuffer}, +}; + VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanCommandProcessor& command_processor, const RegisterFile& register_file) @@ -35,6 +121,342 @@ VulkanRenderTargetCache::VulkanRenderTargetCache( VulkanRenderTargetCache::~VulkanRenderTargetCache() { Shutdown(true); } bool VulkanRenderTargetCache::Initialize() { + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + + // Descriptor set layouts. 
+ VkDescriptorSetLayoutBinding descriptor_set_layout_bindings[2]; + descriptor_set_layout_bindings[0].binding = 0; + descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + descriptor_set_layout_bindings[0].descriptorCount = 1; + descriptor_set_layout_bindings[0].stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT; + descriptor_set_layout_bindings[0].pImmutableSamplers = nullptr; + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info; + descriptor_set_layout_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.bindingCount = 1; + descriptor_set_layout_create_info.pBindings = descriptor_set_layout_bindings; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_storage_buffer_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with one storage buffer"); + Shutdown(); + return false; + } + descriptor_set_layout_bindings[0].descriptorType = + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_sampled_image_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with one sampled image"); + Shutdown(); + return false; + } + descriptor_set_layout_bindings[1].binding = 1; + descriptor_set_layout_bindings[1].descriptorType = + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_layout_bindings[1].descriptorCount = 1; + descriptor_set_layout_bindings[1].stageFlags = + descriptor_set_layout_bindings[0].stageFlags; + descriptor_set_layout_bindings[1].pImmutableSamplers = nullptr; + descriptor_set_layout_create_info.bindingCount = 2; + if (dfn.vkCreateDescriptorSetLayout( + device, &descriptor_set_layout_create_info, nullptr, + &descriptor_set_layout_sampled_image_x2_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the descriptor set layout " + "with two sampled images"); + Shutdown(); + return false; + } + + // Descriptor set pools. + // The pool sizes were chosen without a specific reason. + VkDescriptorPoolSize descriptor_set_layout_size; + descriptor_set_layout_size.type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_layout_size.descriptorCount = 1; + descriptor_set_pool_sampled_image_ = + std::make_unique( + provider, 256, 1, &descriptor_set_layout_size, + descriptor_set_layout_sampled_image_); + descriptor_set_layout_size.descriptorCount = 2; + descriptor_set_pool_sampled_image_x2_ = + std::make_unique( + provider, 256, 1, &descriptor_set_layout_size, + descriptor_set_layout_sampled_image_x2_); + + // EDRAM contents reinterpretation buffer. + // 90 MB with 9x resolution scaling - within the minimum + // maxStorageBufferRange. + if (!ui::vulkan::util::CreateDedicatedAllocationBuffer( + provider, + VkDeviceSize(xenos::kEdramSizeBytes * resolution_scale_x_ * + resolution_scale_y_), + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + ui::vulkan::util::MemoryPurpose::kDeviceLocal, edram_buffer_, + edram_buffer_memory_)) { + XELOGE("VulkanRenderTargetCache: Failed to create the EDRAM buffer"); + Shutdown(); + return false; + } + if (GetPath() == Path::kPixelShaderInterlock) { + // The first operation will likely be drawing. 
+ edram_buffer_usage_ = EdramBufferUsage::kFragmentReadWrite; + } else { + // The first operation will likely be depth self-comparison. + edram_buffer_usage_ = EdramBufferUsage::kFragmentRead; + } + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + VkDescriptorPoolSize edram_storage_buffer_descriptor_pool_size; + edram_storage_buffer_descriptor_pool_size.type = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + edram_storage_buffer_descriptor_pool_size.descriptorCount = 1; + VkDescriptorPoolCreateInfo edram_storage_buffer_descriptor_pool_create_info; + edram_storage_buffer_descriptor_pool_create_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + edram_storage_buffer_descriptor_pool_create_info.pNext = nullptr; + edram_storage_buffer_descriptor_pool_create_info.flags = 0; + edram_storage_buffer_descriptor_pool_create_info.maxSets = 1; + edram_storage_buffer_descriptor_pool_create_info.poolSizeCount = 1; + edram_storage_buffer_descriptor_pool_create_info.pPoolSizes = + &edram_storage_buffer_descriptor_pool_size; + if (dfn.vkCreateDescriptorPool( + device, &edram_storage_buffer_descriptor_pool_create_info, nullptr, + &edram_storage_buffer_descriptor_pool_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the EDRAM buffer storage " + "buffer descriptor pool"); + Shutdown(); + return false; + } + VkDescriptorSetAllocateInfo edram_storage_buffer_descriptor_set_allocate_info; + edram_storage_buffer_descriptor_set_allocate_info.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + edram_storage_buffer_descriptor_set_allocate_info.pNext = nullptr; + edram_storage_buffer_descriptor_set_allocate_info.descriptorPool = + edram_storage_buffer_descriptor_pool_; + edram_storage_buffer_descriptor_set_allocate_info.descriptorSetCount = 1; + edram_storage_buffer_descriptor_set_allocate_info.pSetLayouts = + &descriptor_set_layout_storage_buffer_; + if (dfn.vkAllocateDescriptorSets( + device, &edram_storage_buffer_descriptor_set_allocate_info, + &edram_storage_buffer_descriptor_set_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to allocate the EDRAM buffer storage " + "buffer descriptor set"); + Shutdown(); + return false; + } + VkDescriptorBufferInfo edram_storage_buffer_descriptor_buffer_info; + edram_storage_buffer_descriptor_buffer_info.buffer = edram_buffer_; + edram_storage_buffer_descriptor_buffer_info.offset = 0; + edram_storage_buffer_descriptor_buffer_info.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet edram_storage_buffer_descriptor_write; + edram_storage_buffer_descriptor_write.sType = + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + edram_storage_buffer_descriptor_write.pNext = nullptr; + edram_storage_buffer_descriptor_write.dstSet = + edram_storage_buffer_descriptor_set_; + edram_storage_buffer_descriptor_write.dstBinding = 0; + edram_storage_buffer_descriptor_write.dstArrayElement = 0; + edram_storage_buffer_descriptor_write.descriptorCount = 1; + edram_storage_buffer_descriptor_write.descriptorType = + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + edram_storage_buffer_descriptor_write.pImageInfo = nullptr; + edram_storage_buffer_descriptor_write.pBufferInfo = + &edram_storage_buffer_descriptor_buffer_info; + edram_storage_buffer_descriptor_write.pTexelBufferView = nullptr; + dfn.vkUpdateDescriptorSets(device, 1, &edram_storage_buffer_descriptor_write, + 0, nullptr); + + // TODO(Triang3l): All paths (FSI). + + // TODO(Triang3l): Handle sampledImageIntegerSampleCounts 4 not supported in + // transfers. 
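// Illustrative sketch, not code from this patch: rough shape of how the new
// compute-side plumbing added in this change (BindExternalComputePipeline,
// CmdVkPushConstants, CmdVkDispatch) could drive one host depth store pass.
// Descriptor set binding is omitted, and the function and parameter names
// below are placeholders rather than the render target cache's actual code.
void DispatchHostDepthStoreSketch(VulkanCommandProcessor& command_processor,
                                  DeferredCommandBuffer& command_buffer,
                                  VkPipeline host_depth_store_pipeline,
                                  VkPipelineLayout host_depth_store_layout,
                                  const HostDepthStoreConstants& constants,
                                  uint32_t group_count_x,
                                  uint32_t group_count_y) {
  // Bind the compute pipeline through the processor so redundant binds of the
  // same external pipeline are skipped.
  command_processor.BindExternalComputePipeline(host_depth_store_pipeline);
  // Record the push constants and the dispatch into the deferred command
  // buffer; both are replayed later in Execute().
  command_buffer.CmdVkPushConstants(host_depth_store_layout,
                                    VK_SHADER_STAGE_COMPUTE_BIT, 0,
                                    sizeof(constants), &constants);
  command_buffer.CmdVkDispatch(group_count_x, group_count_y, 1);
}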
+ if (cvars::native_2x_msaa) { + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + // Multisampled integer sampled images are optional in Vulkan and in Xenia. + msaa_2x_attachments_supported_ = + (device_limits.framebufferColorSampleCounts & + device_limits.framebufferDepthSampleCounts & + device_limits.framebufferStencilSampleCounts & + device_limits.sampledImageColorSampleCounts & + device_limits.sampledImageDepthSampleCounts & + device_limits.sampledImageStencilSampleCounts & + VK_SAMPLE_COUNT_2_BIT) && + (device_limits.sampledImageIntegerSampleCounts & + (VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT)) != + VK_SAMPLE_COUNT_4_BIT; + msaa_2x_no_attachments_supported_ = + (device_limits.framebufferNoAttachmentsSampleCounts & + VK_SAMPLE_COUNT_2_BIT) != 0; + } else { + msaa_2x_attachments_supported_ = false; + msaa_2x_no_attachments_supported_ = false; + } + + // Host depth storing pipeline layout. + VkDescriptorSetLayout host_depth_store_descriptor_set_layouts[] = { + // Destination EDRAM storage buffer. + descriptor_set_layout_storage_buffer_, + // Source depth / stencil texture (only depth is used). + descriptor_set_layout_sampled_image_x2_, + }; + VkPushConstantRange host_depth_store_push_constant_range; + host_depth_store_push_constant_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + host_depth_store_push_constant_range.offset = 0; + host_depth_store_push_constant_range.size = sizeof(HostDepthStoreConstants); + VkPipelineLayoutCreateInfo host_depth_store_pipeline_layout_create_info; + host_depth_store_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + host_depth_store_pipeline_layout_create_info.pNext = nullptr; + host_depth_store_pipeline_layout_create_info.flags = 0; + host_depth_store_pipeline_layout_create_info.setLayoutCount = + uint32_t(xe::countof(host_depth_store_descriptor_set_layouts)); + host_depth_store_pipeline_layout_create_info.pSetLayouts = + host_depth_store_descriptor_set_layouts; + host_depth_store_pipeline_layout_create_info.pushConstantRangeCount = 1; + host_depth_store_pipeline_layout_create_info.pPushConstantRanges = + &host_depth_store_push_constant_range; + if (dfn.vkCreatePipelineLayout( + device, &host_depth_store_pipeline_layout_create_info, nullptr, + &host_depth_store_pipeline_layout_) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the host depth storing " + "pipeline layout"); + Shutdown(); + return false; + } + const std::pair host_depth_store_shaders[] = { + {shaders::host_depth_store_1xmsaa_cs, + sizeof(shaders::host_depth_store_1xmsaa_cs)}, + {shaders::host_depth_store_2xmsaa_cs, + sizeof(shaders::host_depth_store_2xmsaa_cs)}, + {shaders::host_depth_store_4xmsaa_cs, + sizeof(shaders::host_depth_store_4xmsaa_cs)}, + }; + for (size_t i = 0; i < xe::countof(host_depth_store_shaders); ++i) { + const std::pair host_depth_store_shader = + host_depth_store_shaders[i]; + VkPipeline host_depth_store_pipeline = + ui::vulkan::util::CreateComputePipeline( + provider, host_depth_store_pipeline_layout_, + host_depth_store_shader.first, host_depth_store_shader.second); + if (host_depth_store_pipeline == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the {}-sample host depth " + "storing pipeline", + uint32_t(1) << i); + Shutdown(); + return false; + } + host_depth_store_pipelines_[i] = host_depth_store_pipeline; + } + + // Transfer and clear vertex buffer, for quads of up to tile granularity. 
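+  // Sizing sketch for the vertex buffer pool page below: each rectangle is a
+  // quad of two triangles (6 vertices, as the * 6 factor suggests) with 2
+  // floats (X, Y) per vertex, i.e. 48 bytes, and in the worst case each of
+  // the 2048 EDRAM tiles may need its own cutout border rectangles, hence
+  // sizeof(float) * 2 * 6 * kMaxCutoutBorderRectangles * kEdramTileCount,
+  // never smaller than the pool's default page size due to the std::max.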
+ transfer_vertex_buffer_pool_ = + std::make_unique( + provider, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + std::max(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, + sizeof(float) * 2 * 6 * + Transfer::kMaxCutoutBorderRectangles * + xenos::kEdramTileCount)); + + // Transfer vertex shader. + transfer_passthrough_vertex_shader_ = ui::vulkan::util::CreateShaderModule( + provider, shaders::passthrough_position_xy_vs, + sizeof(shaders::passthrough_position_xy_vs)); + if (transfer_passthrough_vertex_shader_ == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer vertex shader"); + Shutdown(); + return false; + } + + // Transfer pipeline layouts. + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layouts + [kTransferUsedDescriptorSetCount]; + VkPushConstantRange transfer_pipeline_layout_push_constant_range; + transfer_pipeline_layout_push_constant_range.stageFlags = + VK_SHADER_STAGE_FRAGMENT_BIT; + transfer_pipeline_layout_push_constant_range.offset = 0; + VkPipelineLayoutCreateInfo transfer_pipeline_layout_create_info; + transfer_pipeline_layout_create_info.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + transfer_pipeline_layout_create_info.pNext = nullptr; + transfer_pipeline_layout_create_info.flags = 0; + transfer_pipeline_layout_create_info.pSetLayouts = + transfer_pipeline_layout_descriptor_set_layouts; + transfer_pipeline_layout_create_info.pPushConstantRanges = + &transfer_pipeline_layout_push_constant_range; + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[i]; + transfer_pipeline_layout_create_info.setLayoutCount = 0; + uint32_t transfer_pipeline_layout_descriptor_sets_remaining = + transfer_pipeline_layout_info.used_descriptor_sets; + uint32_t transfer_pipeline_layout_descriptor_set_index; + while ( + xe::bit_scan_forward(transfer_pipeline_layout_descriptor_sets_remaining, + &transfer_pipeline_layout_descriptor_set_index)) { + transfer_pipeline_layout_descriptor_sets_remaining &= + ~(uint32_t(1) << transfer_pipeline_layout_descriptor_set_index); + VkDescriptorSetLayout transfer_pipeline_layout_descriptor_set_layout = + VK_NULL_HANDLE; + switch (TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)) { + case kTransferUsedDescriptorSetHostDepthBuffer: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_storage_buffer_; + break; + case kTransferUsedDescriptorSetHostDepthStencilTextures: + case kTransferUsedDescriptorSetDepthStencilTextures: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_x2_; + break; + case kTransferUsedDescriptorSetColorTexture: + transfer_pipeline_layout_descriptor_set_layout = + descriptor_set_layout_sampled_image_; + break; + default: + assert_unhandled_case(TransferUsedDescriptorSet( + transfer_pipeline_layout_descriptor_set_index)); + } + transfer_pipeline_layout_descriptor_set_layouts + [transfer_pipeline_layout_create_info.setLayoutCount++] = + transfer_pipeline_layout_descriptor_set_layout; + } + transfer_pipeline_layout_push_constant_range.size = uint32_t( + sizeof(uint32_t) * + xe::bit_count(transfer_pipeline_layout_info.used_push_constant_dwords)); + transfer_pipeline_layout_create_info.pushConstantRangeCount = + transfer_pipeline_layout_info.used_push_constant_dwords ? 
1 : 0; + if (dfn.vkCreatePipelineLayout( + device, &transfer_pipeline_layout_create_info, nullptr, + &transfer_pipeline_layouts_[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline layout {}", + i); + Shutdown(); + return false; + } + } + InitializeCommon(); return true; } @@ -45,6 +467,36 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); + for (const auto& transfer_pipeline_array_pair : transfer_pipelines_) { + for (VkPipeline transfer_pipeline : transfer_pipeline_array_pair.second) { + // May be null to prevent recreation attempts. + if (transfer_pipeline != VK_NULL_HANDLE) { + dfn.vkDestroyPipeline(device, transfer_pipeline, nullptr); + } + } + } + transfer_pipelines_.clear(); + for (const auto& transfer_shader_pair : transfer_shaders_) { + if (transfer_shader_pair.second != VK_NULL_HANDLE) { + dfn.vkDestroyShaderModule(device, transfer_shader_pair.second, nullptr); + } + } + transfer_shaders_.clear(); + for (size_t i = 0; i < size_t(TransferPipelineLayoutIndex::kCount); ++i) { + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + transfer_pipeline_layouts_[i]); + } + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyShaderModule, device, + transfer_passthrough_vertex_shader_); + transfer_vertex_buffer_pool_.reset(); + + for (size_t i = 0; i < xe::countof(host_depth_store_pipelines_); ++i) { + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipeline, device, + host_depth_store_pipelines_[i]); + } + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyPipelineLayout, device, + host_depth_store_pipeline_layout_); + last_update_framebuffer_ = VK_NULL_HANDLE; for (const auto& framebuffer_pair : framebuffers_) { dfn.vkDestroyFramebuffer(device, framebuffer_pair.second.framebuffer, @@ -54,10 +506,32 @@ void VulkanRenderTargetCache::Shutdown(bool from_destructor) { last_update_render_pass_ = VK_NULL_HANDLE; for (const auto& render_pass_pair : render_passes_) { - dfn.vkDestroyRenderPass(device, render_pass_pair.second, nullptr); + if (render_pass_pair.second != VK_NULL_HANDLE) { + dfn.vkDestroyRenderPass(device, render_pass_pair.second, nullptr); + } } render_passes_.clear(); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorPool, device, + edram_storage_buffer_descriptor_pool_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device, + edram_buffer_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device, + edram_buffer_memory_); + + descriptor_set_pool_sampled_image_x2_.reset(); + descriptor_set_pool_sampled_image_.reset(); + + ui::vulkan::util::DestroyAndNullHandle( + dfn.vkDestroyDescriptorSetLayout, device, + descriptor_set_layout_sampled_image_x2_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, + device, + descriptor_set_layout_sampled_image_); + ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyDescriptorSetLayout, + device, + descriptor_set_layout_storage_buffer_); + if (!from_destructor) { ShutdownCommon(); } @@ -87,6 +561,19 @@ void VulkanRenderTargetCache::ClearCache() { RenderTargetCache::ClearCache(); } +void VulkanRenderTargetCache::CompletedSubmissionUpdated() { + if (transfer_vertex_buffer_pool_) { + transfer_vertex_buffer_pool_->Reclaim( + command_processor_.GetCompletedSubmission()); + } +} + +void VulkanRenderTargetCache::EndSubmission() { + if (transfer_vertex_buffer_pool_) { + 
transfer_vertex_buffer_pool_->FlushWrites(); + } +} + bool VulkanRenderTargetCache::Update(bool is_rasterization_done, uint32_t shader_writes_color_targets) { if (!RenderTargetCache::Update(is_rasterization_done, @@ -94,9 +581,16 @@ bool VulkanRenderTargetCache::Update(bool is_rasterization_done, return false; } - auto rb_surface_info = register_file().Get(); + // TODO(Triang3l): All paths (FSI). + RenderTarget* const* depth_and_color_render_targets = last_update_accumulated_render_targets(); + + PerformTransfersAndResolveClears(1 + xenos::kMaxColorRenderTargets, + depth_and_color_render_targets, + last_update_transfers()); + + auto rb_surface_info = register_file().Get(); uint32_t render_targets_are_srgb = gamma_render_target_as_srgb_ ? last_update_accumulated_color_targets_are_gamma() @@ -104,7 +598,6 @@ bool VulkanRenderTargetCache::Update(bool is_rasterization_done, RenderPassKey render_pass_key; render_pass_key.msaa_samples = rb_surface_info.msaa_samples; - // TODO(Triang3l): 2x MSAA as 4x. if (depth_and_color_render_targets[0]) { render_pass_key.depth_and_color_used |= 1 << 0; render_pass_key.depth_format = @@ -220,9 +713,9 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { samples = VK_SAMPLE_COUNT_1_BIT; break; case xenos::MsaaSamples::k2X: - // Using unconditionally because if 2x is emulated as 4x, the key will - // also contain 4x. - samples = VK_SAMPLE_COUNT_2_BIT; + samples = IsMsaa2xSupported(key.depth_and_color_used != 0) + ? VK_SAMPLE_COUNT_2_BIT + : VK_SAMPLE_COUNT_4_BIT; break; case xenos::MsaaSamples::k4X: samples = VK_SAMPLE_COUNT_4_BIT; @@ -264,7 +757,11 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { color_attachment.attachment = attachment_index; VkAttachmentDescription& attachment = attachments[attachment_index]; attachment.flags = 0; - attachment.format = GetColorVulkanFormat(color_formats[i]); + xenos::ColorRenderTargetFormat color_format = color_formats[i]; + attachment.format = + key.color_rts_use_transfer_formats + ? GetColorOwnershipTransferVulkanFormat(color_format) + : GetColorVulkanFormat(color_format); attachment.samples = samples; attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; @@ -340,7 +837,8 @@ VkRenderPass VulkanRenderTargetCache::GetRenderPass(RenderPassKey key) { VkRenderPass render_pass; if (dfn.vkCreateRenderPass(device, &render_pass_create_info, nullptr, &render_pass) != VK_SUCCESS) { - XELOGE("Failed to create a Vulkan render pass"); + XELOGE("VulkanRenderTargetCache: Failed to create a render pass"); + render_passes_.emplace(key.key, VK_NULL_HANDLE); return VK_NULL_HANDLE; } render_passes_.emplace(key.key, render_pass); @@ -419,8 +917,15 @@ VkFormat VulkanRenderTargetCache::GetColorOwnershipTransferVulkanFormat( } VulkanRenderTargetCache::VulkanRenderTarget::~VulkanRenderTarget() { - const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); - VkDevice device = provider_.device(); + const ui::vulkan::VulkanProvider& provider = + render_target_cache_.command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key().is_depth + ? 
*render_target_cache_.descriptor_set_pool_sampled_image_x2_ + : *render_target_cache_.descriptor_set_pool_sampled_image_; + descriptor_set_pool.Free(descriptor_set_index_transfer_source_); if (view_color_transfer_separate_ != VK_NULL_HANDLE) { dfn.vkDestroyImageView(device, view_color_transfer_separate_, nullptr); } @@ -464,16 +969,20 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( image_create_info.pNext = nullptr; image_create_info.flags = 0; image_create_info.imageType = VK_IMAGE_TYPE_2D; - // TODO(Triang3l): Resolution scaling. - image_create_info.extent.width = key.GetWidth(); + image_create_info.extent.width = key.GetWidth() * resolution_scale_x_; image_create_info.extent.height = - GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples); + GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples) * + resolution_scale_y_; image_create_info.extent.depth = 1; image_create_info.mipLevels = 1; image_create_info.arrayLayers = 1; - // TODO(Triang3l): 2x MSAA as 4x. - image_create_info.samples = - VkSampleCountFlagBits(uint32_t(1) << uint32_t(key.msaa_samples)); + if (key.msaa_samples == xenos::MsaaSamples::k2X && + !msaa_2x_attachments_supported_) { + image_create_info.samples = VK_SAMPLE_COUNT_4_BIT; + } else { + image_create_info.samples = + VkSampleCountFlagBits(uint32_t(1) << uint32_t(key.msaa_samples)); + } image_create_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_create_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT; image_create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; @@ -509,7 +1018,11 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( if (!ui::vulkan::util::CreateDedicatedAllocationImage( provider, image_create_info, ui::vulkan::util::MemoryPurpose::kDeviceLocal, image, memory)) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a {}x{} {}xMSAA {} render target " + "image", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); return nullptr; } @@ -532,7 +1045,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( VkImageView view_depth_color; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_depth_color) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a {} view for a {}x{} {}xMSAA {} " + "render target", + key.is_depth ? "depth" : "color", image_create_info.extent.width, + image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); return nullptr; @@ -546,7 +1064,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_depth_stencil) != VK_SUCCESS) { - // TODO(Triang3l): Error message. 
+ XELOGE( + "VulkanRenderTarget: Failed to create a depth / stencil view for a " + "{}x{} {}xMSAA {} render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetDepthRenderTargetFormatName(key.GetDepthFormat())); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); @@ -555,7 +1078,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_stencil) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a stencil view for a {}x{} " + "{}xMSAA render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetDepthRenderTargetFormatName(key.GetDepthFormat())); dfn.vkDestroyImageView(device, view_depth_stencil, nullptr); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); @@ -567,7 +1095,12 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.format = VK_FORMAT_R8G8B8A8_SRGB; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_srgb) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create an sRGB view for a {}x{} " + "{}xMSAA render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), + xenos::GetColorRenderTargetFormatName(key.GetColorFormat())); dfn.vkDestroyImageView(device, view_depth_color, nullptr); dfn.vkDestroyImage(device, image, nullptr); dfn.vkFreeMemory(device, memory, nullptr); @@ -578,7 +1111,11 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( view_create_info.format = transfer_format; if (dfn.vkCreateImageView(device, &view_create_info, nullptr, &view_color_transfer_separate) != VK_SUCCESS) { - // TODO(Triang3l): Error message. + XELOGE( + "VulkanRenderTarget: Failed to create a transfer view for a {}x{} " + "{}xMSAA {} render target", + image_create_info.extent.width, image_create_info.extent.height, + uint32_t(1) << uint32_t(key.msaa_samples), key.GetFormatName()); if (view_srgb != VK_NULL_HANDLE) { dfn.vkDestroyImageView(device, view_srgb, nullptr); } @@ -590,11 +1127,170 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( } } - VkImageView view_transfer_separate = VK_NULL_HANDLE; + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key.is_depth ? *descriptor_set_pool_sampled_image_x2_ + : *descriptor_set_pool_sampled_image_; + size_t descriptor_set_index_transfer_source = descriptor_set_pool.Allocate(); + if (descriptor_set_index_transfer_source == SIZE_MAX) { + XELOGE( + "VulkanRenderTargetCache: Failed to allocate sampled image descriptors " + "for a {} render target", + key.is_depth ? 
"depth/stencil" : "color"); + if (view_color_transfer_separate != VK_NULL_HANDLE) { + dfn.vkDestroyImageView(device, view_color_transfer_separate, nullptr); + } + if (view_srgb != VK_NULL_HANDLE) { + dfn.vkDestroyImageView(device, view_srgb, nullptr); + } + dfn.vkDestroyImageView(device, view_depth_color, nullptr); + dfn.vkDestroyImage(device, image, nullptr); + dfn.vkFreeMemory(device, memory, nullptr); + return nullptr; + } + VkDescriptorSet descriptor_set_transfer_source = + descriptor_set_pool.Get(descriptor_set_index_transfer_source); + VkWriteDescriptorSet descriptor_set_write[2]; + VkDescriptorImageInfo descriptor_set_write_depth_color; + descriptor_set_write_depth_color.sampler = VK_NULL_HANDLE; + descriptor_set_write_depth_color.imageView = + view_color_transfer_separate != VK_NULL_HANDLE + ? view_color_transfer_separate + : view_depth_color; + descriptor_set_write_depth_color.imageLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + descriptor_set_write[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_set_write[0].pNext = nullptr; + descriptor_set_write[0].dstSet = descriptor_set_transfer_source; + descriptor_set_write[0].dstBinding = 0; + descriptor_set_write[0].dstArrayElement = 0; + descriptor_set_write[0].descriptorCount = 1; + descriptor_set_write[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_write[0].pImageInfo = &descriptor_set_write_depth_color; + descriptor_set_write[0].pBufferInfo = nullptr; + descriptor_set_write[0].pTexelBufferView = nullptr; + VkDescriptorImageInfo descriptor_set_write_stencil; + if (key.is_depth) { + descriptor_set_write_stencil.sampler = VK_NULL_HANDLE; + descriptor_set_write_stencil.imageView = view_stencil; + descriptor_set_write_stencil.imageLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + descriptor_set_write[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_set_write[1].pNext = nullptr; + descriptor_set_write[1].dstSet = descriptor_set_transfer_source; + descriptor_set_write[1].dstBinding = 1; + descriptor_set_write[1].dstArrayElement = 0; + descriptor_set_write[1].descriptorCount = 1; + descriptor_set_write[1].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + descriptor_set_write[1].pImageInfo = &descriptor_set_write_stencil; + descriptor_set_write[1].pBufferInfo = nullptr; + descriptor_set_write[1].pTexelBufferView = nullptr; + } + dfn.vkUpdateDescriptorSets(device, key.is_depth ? 
2 : 1, descriptor_set_write, + 0, nullptr); - return new VulkanRenderTarget(key, provider, image, memory, view_depth_color, + return new VulkanRenderTarget(key, *this, image, memory, view_depth_color, view_depth_stencil, view_stencil, view_srgb, - view_color_transfer_separate); + view_color_transfer_separate, + descriptor_set_index_transfer_source); +} + +void VulkanRenderTargetCache::GetEdramBufferUsageMasks( + EdramBufferUsage usage, VkPipelineStageFlags& stage_mask_out, + VkAccessFlags& access_mask_out) { + switch (usage) { + case EdramBufferUsage::kFragmentRead: + stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT; + break; + case EdramBufferUsage::kFragmentReadWrite: + stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + break; + case EdramBufferUsage::kComputeRead: + stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_READ_BIT; + break; + case EdramBufferUsage::kComputeWrite: + stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + access_mask_out = VK_ACCESS_SHADER_WRITE_BIT; + break; + case EdramBufferUsage::kTransferRead: + stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT; + access_mask_out = VK_ACCESS_TRANSFER_READ_BIT; + break; + case EdramBufferUsage::kTransferWrite: + stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT; + access_mask_out = VK_ACCESS_TRANSFER_WRITE_BIT; + break; + default: + assert_unhandled_case(usage); + } +} + +void VulkanRenderTargetCache::UseEdramBuffer(EdramBufferUsage new_usage) { + if (edram_buffer_usage_ == new_usage) { + return; + } + VkPipelineStageFlags src_stage_mask, dst_stage_mask; + VkAccessFlags src_access_mask, dst_access_mask; + GetEdramBufferUsageMasks(edram_buffer_usage_, src_stage_mask, + src_access_mask); + GetEdramBufferUsageMasks(new_usage, dst_stage_mask, dst_access_mask); + if (command_processor_.PushBufferMemoryBarrier( + edram_buffer_, 0, VK_WHOLE_SIZE, src_stage_mask, dst_stage_mask, + src_access_mask, dst_access_mask)) { + // Resetting edram_buffer_modification_status_ only if the barrier has been + // truly inserted. + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + } + edram_buffer_usage_ = new_usage; +} + +void VulkanRenderTargetCache::MarkEdramBufferModified( + EdramBufferModificationStatus modification_status) { + assert_true(modification_status != + EdramBufferModificationStatus::kUnmodified); + switch (edram_buffer_usage_) { + case EdramBufferUsage::kFragmentReadWrite: + // max because being modified via unordered access requires stricter + // synchronization than via fragment shader interlocks. + edram_buffer_modification_status_ = + std::max(edram_buffer_modification_status_, modification_status); + break; + case EdramBufferUsage::kComputeWrite: + assert_true(modification_status == + EdramBufferModificationStatus::kViaUnordered); + modification_status = EdramBufferModificationStatus::kViaUnordered; + break; + default: + assert_always( + "While changing the usage of the EDRAM buffer before marking it as " + "modified is handled safely (but will cause spurious marking as " + "modified after the changes have been implicitly committed by the " + "usage switch), normally that shouldn't be done and is an " + "indication of architectural mistakes. 
Alternatively, this may " + "indicate that the usage switch has been forgotten before writing, " + "which is a clearly invalid situation."); + } +} + +void VulkanRenderTargetCache::CommitEdramBufferShaderWrites( + EdramBufferModificationStatus commit_status) { + assert_true(commit_status != EdramBufferModificationStatus::kUnmodified); + if (edram_buffer_modification_status_ < commit_status) { + return; + } + VkPipelineStageFlags stage_mask; + VkAccessFlags access_mask; + GetEdramBufferUsageMasks(edram_buffer_usage_, stage_mask, access_mask); + assert_not_zero(access_mask & VK_ACCESS_SHADER_WRITE_BIT); + command_processor_.PushBufferMemoryBarrier( + edram_buffer_, 0, VK_WHOLE_SIZE, stage_mask, stage_mask, access_mask, + access_mask, VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, false); + edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + PixelShaderInterlockFullEdramBarrierPlaced(); } const VulkanRenderTargetCache::Framebuffer* @@ -646,8 +1342,15 @@ VulkanRenderTargetCache::GetFramebuffer( depth_and_color_rts_remaining &= ~(uint32_t(1) << rt_index); const auto& vulkan_rt = *static_cast( depth_and_color_render_targets[rt_index]); - attachments[attachment_count++] = rt_index ? vulkan_rt.view_depth_color() - : vulkan_rt.view_depth_stencil(); + VkImageView attachment; + if (rt_index) { + attachment = render_pass_key.color_rts_use_transfer_formats + ? vulkan_rt.view_color_transfer() + : vulkan_rt.view_depth_color(); + } else { + attachment = vulkan_rt.view_depth_stencil(); + } + attachments[attachment_count++] = attachment; } VkFramebufferCreateInfo framebuffer_create_info; @@ -684,6 +1387,3491 @@ VulkanRenderTargetCache::GetFramebuffer( .first->second; } +VkShaderModule VulkanRenderTargetCache::GetTransferShader( + TransferShaderKey key) { + auto shader_it = transfer_shaders_.find(key); + if (shader_it != transfer_shaders_.end()) { + return shader_it->second; + } + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + + std::vector id_vector_temp; + std::vector uint_vector_temp; + + spv::Builder builder(spv::Spv_1_0, + (SpirvShaderTranslator::kSpirvMagicToolId << 16) | 1, + nullptr); + spv::Id ext_inst_glsl_std_450 = builder.import("GLSL.std.450"); + builder.addCapability(spv::CapabilityShader); + builder.setMemoryModel(spv::AddressingModelLogical, spv::MemoryModelGLSL450); + builder.setSource(spv::SourceLanguageUnknown, 0); + + spv::Id type_void = builder.makeVoidType(); + spv::Id type_bool = builder.makeBoolType(); + spv::Id type_int = builder.makeIntType(32); + spv::Id type_int2 = builder.makeVectorType(type_int, 2); + spv::Id type_uint = builder.makeUintType(32); + spv::Id type_uint2 = builder.makeVectorType(type_uint, 2); + spv::Id type_uint4 = builder.makeVectorType(type_uint, 4); + spv::Id type_float = builder.makeFloatType(32); + spv::Id type_float2 = builder.makeVectorType(type_float, 2); + spv::Id type_float4 = builder.makeVectorType(type_float, 4); + + const TransferModeInfo& mode = kTransferModes[size_t(key.mode)]; + const TransferPipelineLayoutInfo& pipeline_layout_info = + kTransferPipelineLayoutInfos[size_t(mode.pipeline_layout)]; + + // If not dest_is_color, it's depth, or stencil bit - 40-sample columns are + // swapped as opposed to color source. 
+ bool dest_is_color = (mode.output == TransferOutput::kColor); + xenos::ColorRenderTargetFormat dest_color_format = + xenos::ColorRenderTargetFormat(key.dest_resource_format); + xenos::DepthRenderTargetFormat dest_depth_format = + xenos::DepthRenderTargetFormat(key.dest_resource_format); + bool dest_is_64bpp = + dest_is_color && xenos::IsColorRenderTargetFormat64bpp(dest_color_format); + + xenos::ColorRenderTargetFormat source_color_format = + xenos::ColorRenderTargetFormat(key.source_resource_format); + xenos::DepthRenderTargetFormat source_depth_format = + xenos::DepthRenderTargetFormat(key.source_resource_format); + // If not source_is_color, it's depth / stencil - 40-sample columns are + // swapped as opposed to color destination. + bool source_is_color = (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) != 0; + bool source_is_64bpp; + uint32_t source_color_format_component_count; + uint32_t source_color_texture_component_mask; + bool source_color_is_uint; + spv::Id source_color_component_type; + if (source_is_color) { + assert_zero(pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit); + source_is_64bpp = + xenos::IsColorRenderTargetFormat64bpp(source_color_format); + source_color_format_component_count = + xenos::GetColorRenderTargetFormatComponentCount(source_color_format); + if (mode.output == TransferOutput::kStencilBit) { + if (source_is_64bpp && !dest_is_64bpp) { + // Need one component, but choosing from the two 32bpp halves of the + // 64bpp sample. + source_color_texture_component_mask = + 0b1 | (0b1 << (source_color_format_component_count >> 1)); + } else { + // Red is at least 8 bits per component in all formats. + source_color_texture_component_mask = 0b1; + } + } else { + source_color_texture_component_mask = + (uint32_t(1) << source_color_format_component_count) - 1; + } + GetColorOwnershipTransferVulkanFormat(source_color_format, + &source_color_is_uint); + source_color_component_type = source_color_is_uint ? type_uint : type_float; + } else { + source_is_64bpp = false; + source_color_format_component_count = 0; + source_color_texture_component_mask = 0; + source_color_is_uint = false; + source_color_component_type = spv::NoType; + } + + std::vector main_interface; + + // Outputs. + bool shader_uses_stencil_reference_output = + mode.output == TransferOutput::kDepth && + provider.device_extensions().ext_shader_stencil_export; + bool dest_color_is_uint = false; + uint32_t dest_color_component_count = 0; + spv::Id type_fragment_data_component = spv::NoResult; + spv::Id type_fragment_data = spv::NoResult; + spv::Id output_fragment_data = spv::NoResult; + spv::Id output_fragment_depth = spv::NoResult; + spv::Id output_fragment_stencil_ref = spv::NoResult; + switch (mode.output) { + case TransferOutput::kColor: + GetColorOwnershipTransferVulkanFormat(dest_color_format, + &dest_color_is_uint); + dest_color_component_count = + xenos::GetColorRenderTargetFormatComponentCount(dest_color_format); + type_fragment_data_component = + dest_color_is_uint ? type_uint : type_float; + type_fragment_data = + dest_color_component_count > 1 + ? 
builder.makeVectorType(type_fragment_data_component, + dest_color_component_count) + : type_fragment_data_component; + output_fragment_data = builder.createVariable( + spv::NoPrecision, spv::StorageClassOutput, type_fragment_data, + "xe_transfer_fragment_data"); + builder.addDecoration(output_fragment_data, spv::DecorationLocation, + key.dest_color_rt_index); + main_interface.push_back(output_fragment_data); + break; + case TransferOutput::kDepth: + output_fragment_depth = + builder.createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float, "gl_FragDepth"); + builder.addDecoration(output_fragment_depth, spv::DecorationBuiltIn, + spv::BuiltInFragDepth); + main_interface.push_back(output_fragment_depth); + if (shader_uses_stencil_reference_output) { + builder.addExtension("SPV_EXT_shader_stencil_export"); + builder.addCapability(spv::CapabilityStencilExportEXT); + output_fragment_stencil_ref = + builder.createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_int, "gl_FragStencilRefARB"); + builder.addDecoration(output_fragment_stencil_ref, + spv::DecorationBuiltIn, + spv::BuiltInFragStencilRefEXT); + main_interface.push_back(output_fragment_stencil_ref); + } + break; + default: + break; + } + + // Bindings. + // Generating SPIR-V 1.0, no need to add bindings to the entry point's + // interface until SPIR-V 1.4. + // Color source. + bool source_is_multisampled = + key.source_msaa_samples != xenos::MsaaSamples::k1X; + spv::Id source_color_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) { + source_color_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(source_color_component_type, spv::Dim2D, false, + false, source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_color"); + builder.addDecoration( + source_color_texture, spv::DecorationDescriptorSet, + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetColorTextureBit - 1))); + builder.addDecoration(source_color_texture, spv::DecorationBinding, 0); + } + // Depth / stencil source. + spv::Id source_depth_texture = spv::NoResult; + spv::Id source_stencil_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + uint32_t source_depth_stencil_descriptor_set = + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetDepthStencilTexturesBit - 1)); + // Using `depth == false` in makeImageType because comparisons are not + // required, and other values of `depth` are causing issues in drivers. 
+ // https://github.com/microsoft/DirectXShaderCompiler/issues/1107 + if (mode.output != TransferOutput::kStencilBit) { + source_depth_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(type_float, spv::Dim2D, false, false, + source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_depth"); + builder.addDecoration(source_depth_texture, spv::DecorationDescriptorSet, + source_depth_stencil_descriptor_set); + builder.addDecoration(source_depth_texture, spv::DecorationBinding, 0); + } + if (mode.output != TransferOutput::kDepth || + shader_uses_stencil_reference_output) { + source_stencil_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType(type_uint, spv::Dim2D, false, false, + source_is_multisampled, 1, + spv::ImageFormatUnknown), + "xe_transfer_stencil"); + builder.addDecoration(source_stencil_texture, + spv::DecorationDescriptorSet, + source_depth_stencil_descriptor_set); + builder.addDecoration(source_stencil_texture, spv::DecorationBinding, 1); + } + } + // Host depth source buffer. + spv::Id host_depth_source_buffer = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthBufferBit) { + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeRuntimeArray(type_uint)); + // Storage buffers have std430 packing, no padding to 4-component vectors. + builder.addDecoration(id_vector_temp.back(), spv::DecorationArrayStride, + sizeof(float)); + spv::Id type_host_depth_source_buffer = + builder.makeStructType(id_vector_temp, "XeTransferHostDepthBuffer"); + builder.addMemberName(type_host_depth_source_buffer, 0, "host_depth"); + builder.addMemberDecoration(type_host_depth_source_buffer, 0, + spv::DecorationNonWritable); + builder.addMemberDecoration(type_host_depth_source_buffer, 0, + spv::DecorationOffset, 0); + // Block since SPIR-V 1.3, but since SPIR-V 1.0 is generated, it's + // BufferBlock. + builder.addDecoration(type_host_depth_source_buffer, + spv::DecorationBufferBlock); + // StorageBuffer since SPIR-V 1.3, but since SPIR-V 1.0 is generated, it's + // Uniform. + host_depth_source_buffer = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniform, + type_host_depth_source_buffer, "xe_transfer_host_depth_buffer"); + builder.addDecoration( + host_depth_source_buffer, spv::DecorationDescriptorSet, + xe::bit_count(pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthBufferBit - 1))); + builder.addDecoration(host_depth_source_buffer, spv::DecorationBinding, 0); + } + // Host depth source texture (the depth / stencil descriptor set is reused, + // but stencil is not needed). + spv::Id host_depth_source_texture = spv::NoResult; + if (pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + host_depth_source_texture = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniformConstant, + builder.makeImageType( + type_float, spv::Dim2D, false, false, + key.host_depth_source_msaa_samples != xenos::MsaaSamples::k1X, 1, + spv::ImageFormatUnknown), + "xe_transfer_host_depth"); + builder.addDecoration( + host_depth_source_texture, spv::DecorationDescriptorSet, + xe::bit_count( + pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthStencilTexturesBit - 1))); + builder.addDecoration(host_depth_source_texture, spv::DecorationBinding, 0); + } + // Push constants. 
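+  // The push constant block is tightly packed: a member is added only for
+  // the dwords this pipeline layout uses, at an offset of sizeof(uint32_t)
+  // times the number of used dwords below it. Illustrative example (assuming
+  // the address dword bit is below the stencil mask bit, as the member order
+  // here suggests): a layout using only the address and the stencil mask
+  // gets `address` at offset 0 and `stencil_mask` at offset 4.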
+ id_vector_temp.clear(); + uint32_t push_constants_member_host_depth_address = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + push_constants_member_host_depth_address = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + uint32_t push_constants_member_address = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + push_constants_member_address = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + uint32_t push_constants_member_stencil_mask = UINT32_MAX; + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) { + push_constants_member_stencil_mask = uint32_t(id_vector_temp.size()); + id_vector_temp.push_back(type_uint); + } + spv::Id push_constants = spv::NoResult; + if (!id_vector_temp.empty()) { + spv::Id type_push_constants = + builder.makeStructType(id_vector_temp, "XeTransferPushConstants"); + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + assert_true(push_constants_member_host_depth_address != UINT32_MAX); + builder.addMemberName(type_push_constants, + push_constants_member_host_depth_address, + "host_depth_address"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_host_depth_address, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count( + pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordHostDepthAddressBit - 1))); + } + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + assert_true(push_constants_member_address != UINT32_MAX); + builder.addMemberName(type_push_constants, push_constants_member_address, + "address"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_address, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count(pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordAddressBit - 1))); + } + if (pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) { + assert_true(push_constants_member_stencil_mask != UINT32_MAX); + builder.addMemberName(type_push_constants, + push_constants_member_stencil_mask, "stencil_mask"); + builder.addMemberDecoration( + type_push_constants, push_constants_member_stencil_mask, + spv::DecorationOffset, + sizeof(uint32_t) * + xe::bit_count( + pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordStencilMaskBit - 1))); + } + builder.addDecoration(type_push_constants, spv::DecorationBlock); + push_constants = builder.createVariable( + spv::NoPrecision, spv::StorageClassPushConstant, type_push_constants, + "xe_transfer_push_constants"); + } + + // Coordinate inputs. + spv::Id input_fragment_coord = builder.createVariable( + spv::NoPrecision, spv::StorageClassInput, type_float4, "gl_FragCoord"); + builder.addDecoration(input_fragment_coord, spv::DecorationBuiltIn, + spv::BuiltInFragCoord); + main_interface.push_back(input_fragment_coord); + spv::Id input_sample_id = spv::NoResult; + spv::Id spec_const_sample_id = spv::NoResult; + if (key.dest_msaa_samples != xenos::MsaaSamples::k1X) { + if (device_features.sampleRateShading) { + // One draw for all samples. 
+ builder.addCapability(spv::CapabilitySampleRateShading); + input_sample_id = builder.createVariable( + spv::NoPrecision, spv::StorageClassInput, type_int, "gl_SampleID"); + builder.addDecoration(input_sample_id, spv::DecorationFlat); + builder.addDecoration(input_sample_id, spv::DecorationBuiltIn, + spv::BuiltInSampleId); + main_interface.push_back(input_sample_id); + } else { + // One sample per draw, with different sample masks. + spec_const_sample_id = builder.makeUintConstant(0, true); + builder.addName(spec_const_sample_id, "xe_transfer_sample_id"); + builder.addDecoration(spec_const_sample_id, spv::DecorationSpecId, 0); + } + } + + // Begin the main function. + std::vector main_param_types; + std::vector> main_precisions; + spv::Block* main_entry; + spv::Function* main_function = + builder.makeFunctionEntry(spv::NoPrecision, type_void, "main", + main_param_types, main_precisions, &main_entry); + + // Working with unsigned numbers for simplicity now, bitcasting to signed will + // be done at texture fetch. + + uint32_t tile_width_samples_scaled = + xenos::kEdramTileWidthSamples * resolution_scale_x_; + uint32_t tile_height_samples_scaled = + xenos::kEdramTileHeightSamples * resolution_scale_y_; + + // Convert the fragment coordinates to uint2. + uint_vector_temp.clear(); + uint_vector_temp.reserve(2); + uint_vector_temp.push_back(0); + uint_vector_temp.push_back(1); + spv::Id dest_pixel_coord = builder.createUnaryOp( + spv::OpConvertFToU, type_uint2, + builder.createRvalueSwizzle( + spv::NoPrecision, type_float2, + builder.createLoad(input_fragment_coord, spv::NoPrecision), + uint_vector_temp)); + + // Prove to the AMD compiler that 24*24 multiplication can be done. 16 bits + // are more than enough for coordinates even with 3x resolution scaling (and + // Direct3D 11 hardware has 16.8 fixed-point coordinates). + // TODO(Triang3l): OpUnreachable if the coordinates have upper bits set. + + // Split the destination pixel coordinate into scalars. + spv::Id dest_pixel_x = + builder.createCompositeExtract(dest_pixel_coord, type_uint, 0); + spv::Id dest_pixel_y = + builder.createCompositeExtract(dest_pixel_coord, type_uint, 1); + + // Split the destination pixel index into 32bpp tile and 32bpp-tile-relative + // pixel index. + uint32_t dest_sample_width_log2 = + uint32_t(dest_is_64bpp) + + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X); + uint32_t dest_sample_height_log2 = + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); + uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_shift; + draw_util::GetEdramTileWidthDivideScaleAndUpperShift( + resolution_scale_x_, dest_tile_width_divide_scale, + dest_tile_width_divide_shift); + // Doing 16*16=32 multiplication, not 32*32=64. + // TODO(Triang3l): Abstract this away, don't do 32*32 on Direct3D 12 too. + dest_tile_width_divide_scale &= UINT16_MAX; + dest_tile_width_divide_shift += 16; + // Need the host tile size in pixels, not samples. 
+ dest_tile_width_divide_shift -= dest_sample_width_log2; + spv::Id dest_tile_index_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, dest_pixel_x, + builder.makeUintConstant(dest_tile_width_divide_scale)), + builder.makeUintConstant(dest_tile_width_divide_shift)); + spv::Id dest_tile_pixel_x = builder.createBinOp( + spv::OpISub, type_uint, dest_pixel_x, + builder.createBinOp(spv::OpIMul, type_uint, dest_tile_index_x, + builder.makeUintConstant(tile_width_samples_scaled >> + dest_sample_width_log2))); + spv::Id dest_tile_index_y, dest_tile_pixel_y; + if (resolution_scale_y_ == 3) { + dest_tile_index_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, dest_pixel_y, + builder.makeUintConstant(draw_util::kDivideScale3 & UINT16_MAX)), + builder.makeUintConstant(draw_util::kDivideUpperShift3 + 16 + 4 - + dest_sample_height_log2)); + dest_tile_pixel_y = builder.createBinOp( + spv::OpISub, type_uint, dest_pixel_y, + builder.createBinOp( + spv::OpIMul, type_uint, dest_tile_index_y, + builder.makeUintConstant(tile_height_samples_scaled >> + dest_sample_height_log2))); + } else { + assert_true(resolution_scale_y_ <= 2); + uint32_t dest_tile_height_pixels_log2 = + (resolution_scale_y_ == 2 ? 5 : 4) - dest_sample_height_log2; + dest_tile_index_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_pixel_y, + builder.makeUintConstant(dest_tile_height_pixels_log2)); + dest_tile_pixel_y = builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_pixel_y, + builder.makeUintConstant((uint32_t(1) << dest_tile_height_pixels_log2) - + 1)); + } + + assert_true(push_constants_member_address != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back( + builder.makeIntConstant(int32_t(push_constants_member_address))); + spv::Id address_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, push_constants, + id_vector_temp), + spv::NoPrecision); + + // Calculate the 32bpp tile index from its X and Y parts. + spv::Id dest_tile_index = builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, address_constant, + builder.makeUintConstant(0), + builder.makeUintConstant(xenos::kEdramPitchTilesBits)), + dest_tile_index_y), + dest_tile_index_x); + + // Load the destination sample index. + spv::Id dest_sample_id = spv::NoResult; + if (key.dest_msaa_samples != xenos::MsaaSamples::k1X) { + if (device_features.sampleRateShading) { + assert_true(input_sample_id != spv::NoResult); + dest_sample_id = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createLoad(input_sample_id, spv::NoPrecision)); + } else { + assert_true(spec_const_sample_id != spv::NoResult); + // Already uint. + dest_sample_id = spec_const_sample_id; + } + } + + // Transform the destination framebuffer pixel and sample coordinates into the + // source texture pixel and sample coordinates. + + // First sample bit at 4x with Vulkan standard locations - horizontal sample. + // Second sample bit at 4x with Vulkan standard locations - vertical sample. + // At 2x: + // - Native 2x: top is 1 in Vulkan, bottom is 0. + // - 2x as 4x: top is 0, bottom is 3. 
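+  // For reference, the Vulkan standard sample locations behind this mapping:
+  // 2x: s0 = (0.75, 0.75) - bottom, s1 = (0.25, 0.25) - top.
+  // 4x: s0 = (0.375, 0.125), s1 = (0.875, 0.375) - top half,
+  //     s2 = (0.125, 0.625), s3 = (0.625, 0.875) - bottom half,
+  // so bit 0 of a 4x sample index selects left / right, bit 1 selects
+  // top / bottom, and 2x emulated as 4x uses samples 0 (top) and 3 (bottom).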
+ + spv::Id source_sample_id = dest_sample_id; + spv::Id source_tile_pixel_x = dest_tile_pixel_x; + spv::Id source_tile_pixel_y = dest_tile_pixel_y; + spv::Id source_color_half = spv::NoResult; + if (!source_is_64bpp && dest_is_64bpp) { + // 32bpp -> 64bpp, need two samples of the source. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 4x ->. + // Source has 32bpp halves in two adjacent samples. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 4x -> 4x. + // 1 destination horizontal sample = 2 source horizontal samples. + // D p0,0 s0,0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,0 s1,0 = S p1,0 s0,0 | S p1,0 s1,0 + // D p0,0 s0,1 = S p0,0 s0,1 | S p0,0 s1,1 + // D p0,0 s1,1 = S p1,0 s0,1 | S p1,0 s1,1 + // Thus destination horizontal sample -> source horizontal pixel, + // vertical samples are 1:1. + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1 << 1)); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_x = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 32bpp -> 64bpp, 4x -> 2x. + // 1 destination horizontal pixel = 2 source horizontal samples. + // D p0,0 s0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,0 s1 = S p0,0 s0,1 | S p0,0 s1,1 + // D p1,0 s0 = S p1,0 s0,0 | S p1,0 s1,0 + // D p1,0 s1 = S p1,0 s0,1 | S p1,0 s1,1 + // Pixel index can be reused. Sample 1 (for native 2x) or 0 (for 2x as + // 4x) should become samples 01, sample 0 or 3 should become samples 23. + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, + builder.createBinOp(spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1)), + builder.makeUintConstant(1)); + } else { + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1 << 1)); + } + } else { + // 32bpp -> 64bpp, 4x -> 1x. + // 1 destination horizontal pixel = 2 source horizontal samples. + // D p0,0 = S p0,0 s0,0 | S p0,0 s1,0 + // D p0,1 = S p0,0 s0,1 | S p0,0 s1,1 + // Horizontal pixel index can be reused. Vertical pixel 1 should + // become sample 2. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + source_tile_pixel_y = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_y, builder.makeUintConstant(1)); + } + } else { + // 32bpp -> 64bpp, 1x/2x ->. + // Source has 32bpp halves in two adjacent pixels. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 32bpp -> 64bpp, 1x/2x -> 4x. + // The X part. + // 1 destination horizontal sample = 2 source horizontal pixels. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftLeftLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(2))); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_tile_pixel_x = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + // Y is handled by common code. + } else { + // 32bpp -> 64bpp, 1x/2x -> 1x/2x. + // The X part. + // 1 destination horizontal pixel = 2 source horizontal pixels. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftLeftLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + // Y is handled by common code. + } + } + } else if (source_is_64bpp && !dest_is_64bpp) { + // 64bpp -> 32bpp, also the half to load. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, -> 4x. + // The needed half is in the destination horizontal sample index. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, 4x -> 4x. + // D p0,0 s0,0 = S s0,0 low + // D p0,0 s1,0 = S s0,0 high + // D p1,0 s0,0 = S s1,0 low + // D p1,0 s1,0 = S s1,0 high + // Vertical pixel and sample (second bit) addressing is the same. + // However, 1 horizontal destination pixel = 1 horizontal source sample. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + // 2 destination horizontal samples = 1 source horizontal sample, thus + // 2 destination horizontal pixels = 1 source horizontal pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + } else { + // 64bpp -> 32bpp, 1x/2x -> 4x. + // 2 destination horizontal samples = 1 source horizontal pixel, thus + // 1 destination horizontal pixel = 1 source horizontal pixel. Can reuse + // horizontal pixel index. + // Y is handled by common code. + } + // Half from the destination horizontal sample index. + source_color_half = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_sample_id, + builder.makeUintConstant(1)); + } else { + // 64bpp -> 32bpp, -> 1x/2x. + // The needed half is in the destination horizontal pixel index. + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // 64bpp -> 32bpp, 4x -> 1x/2x. + // (Destination horizontal pixel >> 1) & 1 = source horizontal sample + // (first bit). + source_sample_id = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1), builder.makeUintConstant(1)); + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 64bpp -> 32bpp, 4x -> 2x. + // Destination vertical samples (1/0 in the first bit for native 2x or + // 0/1 in the second bit for 2x as 4x) = source vertical samples + // (second bit). 
+ if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } else { + // 64bpp -> 32bpp, 4x -> 1x. + // 1 destination vertical pixel = 1 source vertical sample. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + // 2 destination horizontal pixels = 1 source horizontal sample. + // 4 destination horizontal pixels = 1 source horizontal pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(2)); + } else { + // 64bpp -> 32bpp, 1x/2x -> 1x/2x. + // The X part. + // 2 destination horizontal pixels = 1 destination source pixel. + source_tile_pixel_x = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_x, builder.makeUintConstant(1)); + // Y is handled by common code. + } + // Half from the destination horizontal pixel index. + source_color_half = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } + assert_true(source_color_half != spv::NoResult); + } else { + // Same bit count. + if (key.source_msaa_samples != key.dest_msaa_samples) { + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + // Same BPP, 4x -> 1x/2x. + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // Same BPP, 4x -> 2x. + // Horizontal pixels to samples. Vertical sample (1/0 in the first bit + // for native 2x or 0/1 in the second bit for 2x as 4x) to second + // sample bit. 
+ if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } else { + // Same BPP, 4x -> 1x. + // Pixels to samples. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + } else { + // Same BPP, 1x/2x -> 1x/2x/4x (as long as they're different). + // Only the X part - Y is handled by common code. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // Horizontal samples to pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_x = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } + } + } + // Common source Y and sample index for 1x/2x AA sources, independent of bits + // per sample. + if (key.source_msaa_samples < xenos::MsaaSamples::k4X && + key.source_msaa_samples != key.dest_msaa_samples) { + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 1x/2x -> 4x. + if (key.source_msaa_samples == xenos::MsaaSamples::k2X) { + // 2x -> 4x. + // Vertical samples (second bit) of 4x destination to vertical sample + // (1, 0 for native 2x, or 0, 3 for 2x as 4x) of 2x source. + source_sample_id = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp(spv::OpBitwiseXor, type_uint, + source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } else { + // 1x -> 4x. 
+ // Vertical samples (second bit) to Y pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x/2x -> different 1x/2x. + if (key.source_msaa_samples == xenos::MsaaSamples::k2X) { + // 2x -> 1x. + // Vertical pixels of 2x destination to vertical samples (1, 0 for + // native 2x, or 0, 3 for 2x as 4x) of 1x source. + source_sample_id = + builder.createBinOp(spv::OpBitwiseAnd, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + source_sample_id = builder.createBinOp(spv::OpBitwiseXor, type_uint, + source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + source_sample_id = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + source_tile_pixel_y = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_tile_pixel_y, builder.makeUintConstant(1)); + } else { + // 1x -> 2x. + // Vertical samples (1/0 in the first bit for native 2x or 0/1 in the + // second bit for 2x as 4x) of 2x destination to vertical pixels of 1x + // source. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back( + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + source_tile_pixel_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + } + } + } + + uint32_t source_pixel_width_dwords_log2 = + uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k4X) + + uint32_t(source_is_64bpp); + + if (source_is_color != dest_is_color) { + // Copying between color and depth / stencil - swap 40-32bpp-sample columns + // in the pixel index within the source 32bpp tile. 
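As far as the guest EDRAM layout is concerned, color and depth tiles keep their two 40-sample-wide halves in opposite order, so the swap below only has to move the X coordinate by half a 32bpp tile in the appropriate direction. A scalar sketch:

#include <cstdint>

// Sketch: exchange the two half-tile columns by shifting X half a tile left
// or right, staying within the tile.
uint32_t SwapTileHalfColumns(uint32_t tile_pixel_x,
                             uint32_t tile_half_width_pixels) {
  return tile_pixel_x < tile_half_width_pixels
             ? tile_pixel_x + tile_half_width_pixels
             : tile_pixel_x - tile_half_width_pixels;
}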
+ uint32_t source_32bpp_tile_half_pixels = + tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2); + source_tile_pixel_x = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + source_tile_pixel_x), + builder.createTriOp( + spv::OpSelect, type_int, + builder.createBinOp( + spv::OpULessThan, builder.makeBoolType(), + source_tile_pixel_x, + builder.makeUintConstant(source_32bpp_tile_half_pixels)), + builder.makeIntConstant(int32_t(source_32bpp_tile_half_pixels)), + builder.makeIntConstant( + -int32_t(source_32bpp_tile_half_pixels))))); + } + + // Transform the destination 32bpp tile index into the source. + spv::Id source_tile_index = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, dest_tile_index), + builder.createTriOp( + spv::OpBitFieldSExtract, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, address_constant), + builder.makeUintConstant(xenos::kEdramPitchTilesBits * 2), + builder.makeUintConstant(xenos::kEdramBaseTilesBits)))); + // Split the source 32bpp tile index into X and Y tile index within the source + // image. + spv::Id source_pitch_tiles = builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, address_constant, + builder.makeUintConstant(xenos::kEdramPitchTilesBits), + builder.makeUintConstant(xenos::kEdramPitchTilesBits)); + spv::Id source_tile_index_y = builder.createBinOp( + spv::OpUDiv, type_uint, source_tile_index, source_pitch_tiles); + spv::Id source_tile_index_x = builder.createBinOp( + spv::OpUMod, type_uint, source_tile_index, source_pitch_tiles); + // Finally calculate the source texture coordinates. + spv::Id source_pixel_x_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled >> + source_pixel_width_dwords_log2), + source_tile_index_x), + source_tile_pixel_x)); + spv::Id source_pixel_y_int = builder.createUnaryOp( + spv::OpBitcast, type_int, + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant( + tile_height_samples_scaled >> + uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)), + source_tile_index_y), + source_tile_pixel_y)); + + // Load the source. + + spv::Builder::TextureParameters source_texture_parameters = {}; + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(source_pixel_x_int); + id_vector_temp.push_back(source_pixel_y_int); + spv::Id source_coordinates[2] = { + builder.createCompositeConstruct(type_int2, id_vector_temp), + }; + spv::Id source_sample_ids_int[2] = {}; + if (key.source_msaa_samples != xenos::MsaaSamples::k1X) { + source_sample_ids_int[0] = + builder.createUnaryOp(spv::OpBitcast, type_int, source_sample_id); + } else { + source_texture_parameters.lod = builder.makeIntConstant(0); + } + // Go to the next sample or pixel along X if need to load two dwords. 
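When a 32bpp source feeds a 64bpp destination, two adjacent 32bpp values are fetched per destination texel; as the code below shows, the high dword comes from the next horizontal sample on a 4x source and from the next horizontal pixel otherwise. A scalar sketch (names illustrative):

#include <cstdint>

struct FetchLocation {
  uint32_t pixel_x;
  uint32_t sample;
};

// Sketch: where the high 32 bits of a 64bpp value live relative to the low
// 32 bits in a 32bpp source.
FetchLocation HighDwordLocation(FetchLocation low, bool source_is_4x_msaa) {
  FetchLocation high = low;
  if (source_is_4x_msaa) {
    high.sample |= 1;   // adjacent horizontal sample
  } else {
    high.pixel_x |= 1;  // adjacent horizontal pixel
  }
  return high;
}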
+ bool source_load_is_two_32bpp_samples = !source_is_64bpp && dest_is_64bpp; + if (source_load_is_two_32bpp_samples) { + if (key.source_msaa_samples >= xenos::MsaaSamples::k4X) { + source_coordinates[1] = source_coordinates[0]; + source_sample_ids_int[1] = builder.createBinOp( + spv::OpBitwiseOr, type_int, source_sample_ids_int[0], + builder.makeIntConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(builder.createBinOp(spv::OpBitwiseOr, type_int, + source_pixel_x_int, + builder.makeIntConstant(1))); + id_vector_temp.push_back(source_pixel_y_int); + source_coordinates[1] = + builder.createCompositeConstruct(type_int2, id_vector_temp); + source_sample_ids_int[1] = source_sample_ids_int[0]; + } + } + spv::Id source_color[2][4] = {}; + if (source_color_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_color_texture, spv::NoPrecision); + assert_true(source_color_component_type != spv::NoType); + spv::Id source_color_vec4_type = + builder.makeVectorType(source_color_component_type, 4); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + spv::Id source_color_vec4 = builder.createTextureCall( + spv::NoPrecision, source_color_vec4_type, false, true, false, false, + false, source_texture_parameters, spv::ImageOperandsMaskNone); + uint32_t source_color_components_remaining = + source_color_texture_component_mask; + uint32_t source_color_component_index; + while (xe::bit_scan_forward(source_color_components_remaining, + &source_color_component_index)) { + source_color_components_remaining &= + ~(uint32_t(1) << source_color_component_index); + source_color[i][source_color_component_index] = + builder.createCompositeExtract(source_color_vec4, + source_color_component_type, + source_color_component_index); + } + } + } + spv::Id source_depth_float[2] = {}; + if (source_depth_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_depth_texture, spv::NoPrecision); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + source_depth_float[i] = builder.createCompositeExtract( + builder.createTextureCall( + spv::NoPrecision, type_float4, false, true, false, false, false, + source_texture_parameters, spv::ImageOperandsMaskNone), + type_float, 0); + } + } + spv::Id source_stencil[2] = {}; + if (source_stencil_texture != spv::NoResult) { + source_texture_parameters.sampler = + builder.createLoad(source_stencil_texture, spv::NoPrecision); + for (uint32_t i = 0; i <= uint32_t(source_load_is_two_32bpp_samples); ++i) { + source_texture_parameters.coords = source_coordinates[i]; + source_texture_parameters.sample = source_sample_ids_int[i]; + source_stencil[i] = builder.createCompositeExtract( + builder.createTextureCall( + spv::NoPrecision, type_uint4, false, true, false, false, false, + source_texture_parameters, spv::ImageOperandsMaskNone), + type_uint, 0); + } + } + + // Pick the needed 32bpp half of the 64bpp color. 
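In the opposite direction, a 64bpp source texel carries two 32bpp guest values in its low and high component halves, and the selection below forwards only the half indicated by source_color_half. Roughly, in scalar form:

#include <cstdint>

// Sketch: copy either the low or the high half of the loaded components
// (1 of 2 or 2 of 4, depending on the 64bpp format).
void PickColorHalf(const uint32_t* source_components, uint32_t component_count,
                   bool second_half, uint32_t* out_components) {
  uint32_t half_count = component_count / 2;
  for (uint32_t i = 0; i < half_count; ++i) {
    out_components[i] = source_components[(second_half ? half_count : 0) + i];
  }
}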
+ if (source_is_64bpp && !dest_is_64bpp) { + uint32_t source_color_half_component_count = + source_color_format_component_count >> 1; + assert_true(source_color_half != spv::NoResult); + spv::Id source_color_is_second_half = + builder.createBinOp(spv::OpINotEqual, type_bool, source_color_half, + builder.makeUintConstant(0)); + if (mode.output == TransferOutput::kStencilBit) { + source_color[0][0] = builder.createTriOp( + spv::OpSelect, source_color_component_type, + source_color_is_second_half, + source_color[0][source_color_half_component_count], + source_color[0][0]); + } else { + for (uint32_t i = 0; i < source_color_half_component_count; ++i) { + source_color[0][i] = builder.createTriOp( + spv::OpSelect, source_color_component_type, + source_color_is_second_half, + source_color[0][source_color_half_component_count + i], + source_color[0][i]); + } + } + } + + if (output_fragment_stencil_ref != spv::NoResult && + source_stencil[0] != spv::NoResult) { + // For the depth -> depth case, write the stencil directly to the output. + assert_true(mode.output == TransferOutput::kDepth); + builder.createStore(source_stencil[0], output_fragment_stencil_ref); + } + + if (dest_is_64bpp) { + // Construct the 64bpp color from two 32-bit samples or one 64-bit sample. + // If `packed` (two uints) are created, use the generic path involving + // unpacking. + // Otherwise, the fragment data output must be written to directly by the + // reached control flow path. + spv::Id packed[2] = {}; + if (source_is_color) { + switch (source_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale = builder.makeFloatConstant(255.0f); + spv::Id component_width = builder.makeUintConstant(8); + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][0], unorm_scale), + unorm_round_offset)); + for (uint32_t j = 1; j < 4; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][j], unorm_scale), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(8 * j)); + id_vector_temp.push_back(component_width); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_rgb = builder.makeFloatConstant(1023.0f); + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id width_a = builder.makeUintConstant(2); + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[i][0], unorm_scale_rgb), + unorm_round_offset)); + for (uint32_t j = 1; j < 4; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + 
spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, source_color[i][j], + j == 3 ? unorm_scale_a : unorm_scale_rgb), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(10 * j)); + id_vector_temp.push_back(j == 3 ? width_a : width_rgb); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id float_0 = builder.makeFloatConstant(0.0f); + spv::Id float_1 = builder.makeFloatConstant(1.0f); + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id offset_a = builder.makeUintConstant(30); + spv::Id width_a = builder.makeUintConstant(2); + for (uint32_t i = 0; i < 2; ++i) { + // Float16 has a wider range for both color and alpha, also NaNs - + // clamp and convert. + packed[i] = SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[i][0], ext_inst_glsl_std_450); + for (uint32_t j = 1; j < 3; ++j) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back( + SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[i][j], ext_inst_glsl_std_450)); + id_vector_temp.push_back(builder.makeUintConstant(10 * j)); + id_vector_temp.push_back(width_rgb); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + // Saturate and convert the alpha. + id_vector_temp.clear(); + id_vector_temp.reserve(3); + id_vector_temp.push_back(source_color[i][3]); + id_vector_temp.push_back(float_0); + id_vector_temp.push_back(float_1); + spv::Id alpha_saturated = + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450NClamp, id_vector_temp); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed[i]); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + alpha_saturated, unorm_scale_a), + unorm_round_offset))); + id_vector_temp.push_back(offset_a); + id_vector_temp.push_back(width_a); + packed[i] = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } break; + // All 64bpp formats, and all 16 bits per component formats, are + // represented as integers in ownership transfer for safe handling of + // NaN encodings and -32768 / -32767. + // TODO(Triang3l): Handle the case when that's not true (no multisampled + // sampled images, no 16-bit UNORM, no cross-packing 32bpp aliasing on a + // portability subset device or a 64bpp format where that wouldn't help + // anyway). 
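Since the 16-bit-per-component data is carried as raw integers, re-packing it for a 32-bit-per-component destination in the cases below amounts to one 16-bit bit-field insert per output dword, i.e. roughly:

#include <cstdint>

// Sketch: combine two 16-bit components, already held as uints, into one
// 32-bit word, as when a 16_16 / 16_16_16_16 source is written to a
// k_32_32_FLOAT destination.
uint32_t Pack16x2(uint32_t low16, uint32_t high16) {
  return (low16 & 0xFFFFu) | (high16 << 16);
}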
+ case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + if (dest_color_format == + xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + spv::Id component_offset_width = builder.makeUintConstant(16); + spv::Id color_16_in_32[2]; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_color[i][0]); + id_vector_temp.push_back(source_color[i][1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + color_16_in_32[i] = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(color_16_in_32[0]); + id_vector_temp.push_back(color_16_in_32[1]); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[i >> 1][i & 1]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + if (dest_color_format == + xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + spv::Id component_offset_width = builder.makeUintConstant(16); + spv::Id color_16_in_32[2]; + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_color[0][i << 1]); + id_vector_temp.push_back(source_color[0][(i << 1) + 1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + color_16_in_32[i] = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(color_16_in_32[0]); + id_vector_temp.push_back(color_16_in_32[1]); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } + } break; + // Float32 is transferred as uint32 to preserve NaN encodings. However, + // multisampled sampled image support is optional in Vulkan. + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = source_color[i][0]; + if (!source_color_is_uint) { + packed[i] = + builder.createUnaryOp(spv::OpBitcast, type_uint, packed[i]); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + for (uint32_t i = 0; i < 2; ++i) { + packed[i] = source_color[0][i]; + if (!source_color_is_uint) { + packed[i] = + builder.createUnaryOp(spv::OpBitcast, type_uint, packed[i]); + } + } + } break; + } + } else { + assert_true(source_depth_texture != spv::NoResult); + assert_true(source_stencil_texture != spv::NoResult); + spv::Id depth_offset = builder.makeUintConstant(8); + spv::Id depth_width = builder.makeUintConstant(24); + for (uint32_t i = 0; i < 2; ++i) { + spv::Id depth24 = spv::NoResult; + switch (source_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. 
This seems to be the + // correct, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, source_depth_float[i], + builder.makeFloatConstant(float(0xFFFFFF)))); + depth24 = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, source_depth_float[i], true, ext_inst_glsl_std_450); + } break; + } + // Merge depth and stencil. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_stencil[i]); + id_vector_temp.push_back(depth24); + id_vector_temp.push_back(depth_offset); + id_vector_temp.push_back(depth_width); + packed[i] = + builder.createOp(spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + // Common path unless there was a specialized one - unpack two packed 32-bit + // parts. + if (packed[0] != spv::NoResult) { + assert_true(packed[1] != spv::NoResult); + if (dest_color_format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(packed[0]); + id_vector_temp.push_back(packed[1]); + // Multisampled sampled images are optional in Vulkan, and image views + // of different formats can't be created separately for sampled image + // and color attachment usages, so no multisampled integer sampled image + // support implies no multisampled integer framebuffer attachment + // support in Xenia. + if (!dest_color_is_uint) { + for (spv::Id& float32 : id_vector_temp) { + float32 = + builder.createUnaryOp(spv::OpBitcast, type_float, float32); + } + } + builder.createStore(builder.createCompositeConstruct(type_fragment_data, + id_vector_temp), + output_fragment_data); + } else { + spv::Id const_uint_0 = builder.makeUintConstant(0); + spv::Id const_uint_16 = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed[i >> 1], + (i & 1) ? const_uint_16 : const_uint_0, const_uint_16)); + } + // TODO(Triang3l): Handle the case when that's not true (no multisampled + // sampled images, no 16-bit UNORM, no cross-packing 32bpp aliasing on a + // portability subset device or a 64bpp format where that wouldn't help + // anyway). + builder.createStore(builder.createCompositeConstruct(type_fragment_data, + id_vector_temp), + output_fragment_data); + } + } + } else { + // If `packed` is created, use the generic path involving unpacking. + // - For a color destination, the packed 32bpp color. + // - For a depth / stencil destination, stencil in 0:7, depth in 8:31 + // normally, or depth in 0:23 and zeros in 24:31 with packed_only_depth. + // - For a stencil bit, stencil in 0:7. + // Otherwise, the fragment data or fragment depth / stencil output must be + // written to directly by the reached control flow path. 
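For the depth / stencil case, the D24S8 packing used above (and again in the 32bpp path below) corresponds to roughly the following scalar conversion; std::nearbyint stands in for GLSLstd450RoundEven under the default round-to-nearest-even mode:

#include <cmath>
#include <cstdint>

// Sketch: convert a [0, 1] depth value to 24-bit fixed point with
// round-to-nearest-even and pack it above the 8-bit stencil.
uint32_t PackD24S8(float depth, uint32_t stencil) {
  uint32_t depth24 = uint32_t(std::nearbyint(depth * 16777215.0f));  // 0xFFFFFF
  return (stencil & 0xFFu) | (depth24 << 8);
}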
+ spv::Id packed = spv::NoResult; + bool packed_only_depth = false; + if (source_is_color) { + switch (source_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + if (dest_is_color && + (dest_color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8 || + dest_color_format == + xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA)) { + // Same format - passthrough. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale = builder.makeFloatConstant(255.0f); + uint32_t packed_component_offset = 0; + if (mode.output == TransferOutput::kDepth) { + // When need only depth, not stencil, skip the red component, and + // put the depth from GBA directly in the lower bits. + packed_component_offset = 1; + packed_only_depth = true; + if (output_fragment_stencil_ref != spv::NoResult) { + builder.createStore( + builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[0][0], + unorm_scale), + unorm_round_offset)), + output_fragment_stencil_ref); + } + } + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, + source_color[0][packed_component_offset], unorm_scale), + unorm_round_offset)); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id component_width = builder.makeUintConstant(8); + for (uint32_t i = 1; i < 4 - packed_component_offset; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, + source_color[0][packed_component_offset + i], + unorm_scale), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(8 * i)); + id_vector_temp.push_back(component_width); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + if (dest_is_color && + (dest_color_format == + xenos::ColorRenderTargetFormat::k_2_10_10_10 || + dest_color_format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_AS_10_10_10_10)) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + spv::Id unorm_round_offset = builder.makeFloatConstant(0.5f); + spv::Id unorm_scale_rgb = builder.makeFloatConstant(1023.0f); + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + source_color[0][0], unorm_scale_rgb), + unorm_round_offset)); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_a = builder.makeFloatConstant(3.0f); + spv::Id width_a = 
builder.makeUintConstant(2); + for (uint32_t i = 1; i < 4; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp( + spv::OpFMul, type_float, source_color[0][i], + i == 3 ? unorm_scale_a : unorm_scale_rgb), + unorm_round_offset))); + id_vector_temp.push_back(builder.makeUintConstant(10 * i)); + id_vector_temp.push_back(i == 3 ? width_a : width_rgb); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + if (dest_is_color && + (dest_color_format == + xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT || + dest_color_format == xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16)) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + // Float16 has a wider range for both color and alpha, also NaNs - + // clamp and convert. + packed = SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[0][0], ext_inst_glsl_std_450); + if (mode.output != TransferOutput::kStencilBit) { + spv::Id width_rgb = builder.makeUintConstant(10); + for (uint32_t i = 1; i < 3; ++i) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back( + SpirvShaderTranslator::UnclampedFloat32To7e3( + builder, source_color[0][i], ext_inst_glsl_std_450)); + id_vector_temp.push_back(builder.makeUintConstant(10 * i)); + id_vector_temp.push_back(width_rgb); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + // Saturate and convert the alpha. + id_vector_temp.clear(); + id_vector_temp.reserve(3); + id_vector_temp.push_back(source_color[0][3]); + id_vector_temp.push_back(builder.makeFloatConstant(0.0f)); + id_vector_temp.push_back(builder.makeFloatConstant(1.0f)); + spv::Id alpha_saturated = + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450NClamp, id_vector_temp); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBinOp( + spv::OpFAdd, type_float, + builder.createBinOp(spv::OpFMul, type_float, + alpha_saturated, + builder.makeFloatConstant(3.0f)), + builder.makeFloatConstant(0.5f)))); + id_vector_temp.push_back(builder.makeUintConstant(30)); + id_vector_temp.push_back(builder.makeUintConstant(2)); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + // All 64bpp formats, and all 16 bits per component formats, are + // represented as integers in ownership transfer for safe handling of + // NaN encodings and -32768 / -32767. 
+ // TODO(Triang3l): Handle the case when that's not true (no + // multisampled sampled images, no 16-bit UNORM, no cross-packing + // 32bpp aliasing on a portability subset device or a 64bpp format + // where that wouldn't help anyway). + if (dest_is_color && + (dest_color_format == xenos::ColorRenderTargetFormat::k_16_16 || + dest_color_format == + xenos::ColorRenderTargetFormat::k_16_16_FLOAT)) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.push_back(source_color[0][i]); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } else { + packed = source_color[0][0]; + if (mode.output != TransferOutput::kStencilBit) { + spv::Id component_offset_width = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(source_color[0][1]); + id_vector_temp.push_back(component_offset_width); + id_vector_temp.push_back(component_offset_width); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } break; + // Float32 is transferred as uint32 to preserve NaN encodings. However, + // multisampled sampled image support is optional in Vulkan. + case xenos::ColorRenderTargetFormat::k_32_FLOAT: + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + packed = source_color[0][0]; + if (!source_color_is_uint) { + packed = builder.createUnaryOp(spv::OpBitcast, type_uint, packed); + } + } break; + } + } else if (source_depth_float[0] != spv::NoResult) { + if (mode.output == TransferOutput::kDepth && + dest_depth_format == source_depth_format) { + builder.createStore(source_depth_float[0], output_fragment_depth); + } else { + switch (source_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. This seems to be the correct, + // adding +0.5 and rounding towards zero results in red instead of + // black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, source_depth_float[0], + builder.makeFloatConstant(float(0xFFFFFF)))); + packed = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + packed = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, source_depth_float[0], true, ext_inst_glsl_std_450); + } break; + } + if (mode.output == TransferOutput::kDepth) { + packed_only_depth = true; + } else { + // Merge depth and stencil. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(source_stencil[0]); + id_vector_temp.push_back(packed); + id_vector_temp.push_back(builder.makeUintConstant(8)); + id_vector_temp.push_back(builder.makeUintConstant(24)); + packed = builder.createOp(spv::OpBitFieldInsert, type_uint, + id_vector_temp); + } + } + } + switch (mode.output) { + case TransferOutput::kColor: { + // Unless a special path was taken, unpack the raw 32bpp value into the + // 32bpp color output. 
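The unpacking in this output mode mirrors the packing; for instance, the k_8_8_8_8 case below is, in scalar terms, roughly:

#include <array>
#include <cstdint>

// Sketch: expand a packed 8_8_8_8 dword back into four UNORM floats.
std::array<float, 4> Unpack_8_8_8_8(uint32_t packed) {
  std::array<float, 4> rgba{};
  for (uint32_t i = 0; i < 4; ++i) {
    rgba[i] = float((packed >> (8 * i)) & 0xFFu) * (1.0f / 255.0f);
  }
  return rgba;
}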
+ if (packed != spv::NoResult) { + switch (dest_color_format) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + spv::Id component_width = builder.makeUintConstant(8); + spv::Id unorm_scale = builder.makeFloatConstant(1.0f / 255.0f); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed, + builder.makeUintConstant(8 * i), component_width)), + unorm_scale)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + spv::Id width_rgb = builder.makeUintConstant(10); + spv::Id unorm_scale_rgb = + builder.makeFloatConstant(1.0f / 1023.0f); + spv::Id width_a = builder.makeUintConstant(2); + spv::Id unorm_scale_a = builder.makeFloatConstant(1.0f / 3.0f); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + for (uint32_t i = 0; i < 4; ++i) { + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, + packed, + builder.makeUintConstant(10 * i), + i == 3 ? width_a : width_rgb)), + i == 3 ? unorm_scale_a : unorm_scale_rgb)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + // Color. + spv::Id width_rgb = builder.makeUintConstant(10); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp.push_back(SpirvShaderTranslator::Float7e3To32( + builder, packed, 10 * i, false, ext_inst_glsl_std_450)); + } + // Alpha. + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createTriOp(spv::OpBitFieldUExtract, type_uint, + packed, builder.makeUintConstant(30), + builder.makeUintConstant(2))), + builder.makeFloatConstant(1.0f / 3.0f))); + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + // All 16 bits per component formats are represented as integers + // in ownership transfer for safe handling of NaN encodings and + // -32768 / -32767. + // TODO(Triang3l): Handle the case when that's not true (no + // multisampled sampled images, no 16-bit UNORM, no cross-packing + // 32bpp aliasing on a portability subset device or a 64bpp format + // where that wouldn't help anyway). + spv::Id component_offset_width = builder.makeUintConstant(16); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp.push_back(builder.createTriOp( + spv::OpBitFieldUExtract, type_uint, packed, + i ? 
component_offset_width : builder.makeUintConstant(0), + component_offset_width)); + } + builder.createStore(builder.createCompositeConstruct( + type_fragment_data, id_vector_temp), + output_fragment_data); + } break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + // Float32 is transferred as uint32 to preserve NaN encodings. + // However, multisampled sampled images are optional in Vulkan, + // and image views of different formats can't be created + // separately for sampled image and color attachment usages, so no + // multisampled integer sampled image support implies no + // multisampled integer framebuffer attachment support in Xenia. + spv::Id float32 = packed; + if (!dest_color_is_uint) { + float32 = + builder.createUnaryOp(spv::OpBitcast, type_float, float32); + } + builder.createStore(float32, output_fragment_data); + } break; + default: + // A 64bpp format (handled separately) or an invalid one. + assert_unhandled_case(dest_color_format); + } + } + } break; + case TransferOutput::kDepth: { + if (packed) { + spv::Id guest_depth24 = packed; + if (!packed_only_depth) { + // Extract the depth bits. + guest_depth24 = + builder.createBinOp(spv::OpShiftRightLogical, type_uint, + guest_depth24, builder.makeUintConstant(8)); + } + // Load the host float32 depth, check if, when converted to the guest + // format, it's the same as the guest source, thus up to date, and if + // it is, write host float32 depth, otherwise do the guest -> host + // conversion. + spv::Id host_depth32 = spv::NoResult; + if (host_depth_source_texture != spv::NoResult) { + // Convert position and sample index from within the destination + // tile to within the host depth source tile, like for the guest + // render target, but for 32bpp -> 32bpp only. + spv::Id host_depth_source_sample_id = dest_sample_id; + spv::Id host_depth_source_tile_pixel_x = dest_tile_pixel_x; + spv::Id host_depth_source_tile_pixel_y = dest_tile_pixel_y; + if (key.host_depth_source_msaa_samples != key.dest_msaa_samples) { + if (key.host_depth_source_msaa_samples >= + xenos::MsaaSamples::k4X) { + // 4x -> 1x/2x. + if (key.dest_msaa_samples == xenos::MsaaSamples::k2X) { + // 4x -> 2x. + // Horizontal pixels to samples. Vertical sample (1/0 in the + // first bit for native 2x or 0/1 in the second bit for 2x as + // 4x) to second sample bit. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(0)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + host_depth_source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + } else { + // 4x -> 1x. + // Pixels to samples. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + host_depth_source_tile_pixel_x = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_x, + builder.makeUintConstant(1)); + host_depth_source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } + } else { + // 1x/2x -> 1x/2x/4x (as long as they're different). + // Only the X part - Y is handled by common code. + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // Horizontal samples to pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_x = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + // Host depth source Y and sample index for 1x/2x AA sources. + if (key.host_depth_source_msaa_samples < + xenos::MsaaSamples::k4X) { + if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) { + // 1x/2x -> 4x. + if (key.host_depth_source_msaa_samples == + xenos::MsaaSamples::k2X) { + // 2x -> 4x. + // Vertical samples (second bit) of 4x destination to + // vertical sample (1, 0 for native 2x, or 0, 3 for 2x as + // 4x) of 2x source. + host_depth_source_sample_id = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + host_depth_source_sample_id = + builder.createBinOp(spv::OpBitwiseXor, type_uint, + host_depth_source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x -> 4x. + // Vertical samples (second bit) to Y pixels. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } else { + // 1x/2x -> different 1x/2x. + if (key.host_depth_source_msaa_samples == + xenos::MsaaSamples::k2X) { + // 2x -> 1x. + // Vertical pixels of 2x destination to vertical samples (1, + // 0 for native 2x, or 0, 3 for 2x as 4x) of 1x source. 
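The recurring "1, 0 for native 2x, or 0, 3 for 2x as 4x" mapping used here can be summarized as a scalar sketch: the XOR flips the sample order when true 2x attachments exist, while 2x emulated as 4x uses samples 0 and 3 (the vertical bit is bit 1):

#include <cstdint>

// Sketch: map a guest vertical half index (0 or 1) to the host sample index,
// depending on how 2x MSAA is represented on the host.
uint32_t VerticalHalfToHostSample(uint32_t vertical_half,
                                  bool msaa_2x_attachments_supported) {
  return msaa_2x_attachments_supported
             ? (vertical_half ^ 1u)                     // samples 1, 0
             : ((vertical_half << 1) | vertical_half);  // samples 0, 3
}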
+ host_depth_source_sample_id = builder.createBinOp( + spv::OpBitwiseAnd, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + if (msaa_2x_attachments_supported_) { + host_depth_source_sample_id = + builder.createBinOp(spv::OpBitwiseXor, type_uint, + host_depth_source_sample_id, + builder.makeUintConstant(1)); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(host_depth_source_sample_id); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(1)); + host_depth_source_sample_id = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + host_depth_source_tile_pixel_y = builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_tile_pixel_y, + builder.makeUintConstant(1)); + } else { + // 1x -> 2x. + // Vertical samples (1/0 in the first bit for native 2x or + // 0/1 in the second bit for 2x as 4x) of 2x destination to + // vertical pixels of 1x source. + if (msaa_2x_attachments_supported_) { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpBitwiseXor, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } else { + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + spv::OpShiftRightLogical, type_uint, dest_sample_id, + builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + host_depth_source_tile_pixel_y = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + } + } + } + } + assert_true(push_constants_member_host_depth_address != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeIntConstant( + int32_t(push_constants_member_host_depth_address))); + spv::Id host_depth_address_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, + push_constants, id_vector_temp), + spv::NoPrecision); + // Transform the destination tile index into the host depth source. + spv::Id host_depth_source_tile_index = builder.createUnaryOp( + spv::OpBitcast, type_uint, + builder.createBinOp( + spv::OpIAdd, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + dest_tile_index), + builder.createTriOp( + spv::OpBitFieldSExtract, type_int, + builder.createUnaryOp(spv::OpBitcast, type_int, + host_depth_address_constant), + builder.makeUintConstant(xenos::kEdramPitchTilesBits * + 2), + builder.makeUintConstant(xenos::kEdramBaseTilesBits)))); + // Split the host depth source tile index into X and Y tile index + // within the source image. 
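This is the same tile addressing as for the color / depth source earlier in the function: split the linear tile index by the pitch in tiles, then add the within-tile coordinate to the tile origin. A scalar sketch (names illustrative):

#include <cstdint>

struct Texel {
  uint32_t x;
  uint32_t y;
};

// Sketch of the tile-to-texel addressing below.
Texel TileToTexel(uint32_t tile_index, uint32_t pitch_tiles,
                  uint32_t tile_width_pixels, uint32_t tile_height_pixels,
                  uint32_t tile_pixel_x, uint32_t tile_pixel_y) {
  uint32_t tile_y = tile_index / pitch_tiles;
  uint32_t tile_x = tile_index % pitch_tiles;
  return {tile_x * tile_width_pixels + tile_pixel_x,
          tile_y * tile_height_pixels + tile_pixel_y};
}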
+ spv::Id host_depth_source_pitch_tiles = builder.createTriOp(
+ spv::OpBitFieldUExtract, type_uint, host_depth_address_constant,
+ builder.makeUintConstant(xenos::kEdramPitchTilesBits),
+ builder.makeUintConstant(xenos::kEdramPitchTilesBits));
+ spv::Id host_depth_source_tile_index_y = builder.createBinOp(
+ spv::OpUDiv, type_uint, host_depth_source_tile_index,
+ host_depth_source_pitch_tiles);
+ spv::Id host_depth_source_tile_index_x = builder.createBinOp(
+ spv::OpUMod, type_uint, host_depth_source_tile_index,
+ host_depth_source_pitch_tiles);
+ // Finally calculate the host depth source texture coordinates.
+ spv::Id host_depth_source_pixel_x_int = builder.createUnaryOp(
+ spv::OpBitcast, type_int,
+ builder.createBinOp(
+ spv::OpIAdd, type_uint,
+ builder.createBinOp(spv::OpIMul, type_uint,
+ builder.makeUintConstant(
+ tile_width_samples_scaled >>
+ uint32_t(key.host_depth_source_msaa_samples >=
+ xenos::MsaaSamples::k4X)),
+ host_depth_source_tile_index_x),
+ host_depth_source_tile_pixel_x));
+ spv::Id host_depth_source_pixel_y_int = builder.createUnaryOp(
+ spv::OpBitcast, type_int,
+ builder.createBinOp(
+ spv::OpIAdd, type_uint,
+ builder.createBinOp(spv::OpIMul, type_uint,
+ builder.makeUintConstant(
+ tile_height_samples_scaled >>
+ uint32_t(key.host_depth_source_msaa_samples >=
+ xenos::MsaaSamples::k2X)),
+ host_depth_source_tile_index_y),
+ host_depth_source_tile_pixel_y));
+ // Load the host depth source.
+ spv::Builder::TextureParameters
+ host_depth_source_texture_parameters = {};
+ host_depth_source_texture_parameters.sampler =
+ builder.createLoad(host_depth_source_texture, spv::NoPrecision);
+ id_vector_temp.clear();
+ id_vector_temp.reserve(2);
+ id_vector_temp.push_back(host_depth_source_pixel_x_int);
+ id_vector_temp.push_back(host_depth_source_pixel_y_int);
+ host_depth_source_texture_parameters.coords =
+ builder.createCompositeConstruct(type_int2, id_vector_temp);
+ if (key.host_depth_source_msaa_samples != xenos::MsaaSamples::k1X) {
+ host_depth_source_texture_parameters.sample =
+ builder.createUnaryOp(spv::OpBitcast, type_int,
+ host_depth_source_sample_id);
+ } else {
+ host_depth_source_texture_parameters.lod =
+ builder.makeIntConstant(0);
+ }
+ host_depth32 = builder.createCompositeExtract(
+ builder.createTextureCall(spv::NoPrecision, type_float4, false,
+ true, false, false, false,
+ host_depth_source_texture_parameters,
+ spv::ImageOperandsMaskNone),
+ type_float, 0);
+ } else if (host_depth_source_buffer != spv::NoResult) {
+ // Get the address in the EDRAM scratch buffer and load from there.
+ // The beginning of the buffer is (0, 0) of the destination.
+ // 40-sample columns are not swapped for addressing simplicity
+ // (because this is used for depth -> depth transfers, where
+ // swapping isn't needed).
+ // Convert samples to pixels.
+ assert_true(key.host_depth_source_msaa_samples ==
+ xenos::MsaaSamples::k1X);
+ spv::Id dest_tile_sample_x = dest_tile_pixel_x;
+ spv::Id dest_tile_sample_y = dest_tile_pixel_y;
+ if (key.dest_msaa_samples >= xenos::MsaaSamples::k2X) {
+ if (key.dest_msaa_samples >= xenos::MsaaSamples::k4X) {
+ // Horizontal sample index in bit 0.
+ id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(dest_sample_id); + id_vector_temp.push_back(dest_tile_pixel_x); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + dest_tile_sample_x = builder.createOp( + spv::OpBitFieldInsert, type_uint, id_vector_temp); + } + // Vertical sample index as 1 or 0 in bit 0 for true 2x or as 0 + // or 1 in bit 1 for 4x or for 2x emulated as 4x. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(builder.createBinOp( + (key.dest_msaa_samples == xenos::MsaaSamples::k2X && + msaa_2x_attachments_supported_) + ? spv::OpBitwiseXor + : spv::OpShiftRightLogical, + type_uint, dest_sample_id, builder.makeUintConstant(1))); + id_vector_temp.push_back(dest_tile_pixel_y); + id_vector_temp.push_back(builder.makeUintConstant(1)); + id_vector_temp.push_back(builder.makeUintConstant(31)); + dest_tile_sample_y = builder.createOp(spv::OpBitFieldInsert, + type_uint, id_vector_temp); + } + // Combine the tile sample index and the tile index. + spv::Id host_depth_offset = builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled * + tile_height_samples_scaled), + dest_tile_index), + builder.createBinOp( + spv::OpIAdd, type_uint, + builder.createBinOp( + spv::OpIMul, type_uint, + builder.makeUintConstant(tile_width_samples_scaled), + dest_tile_sample_y), + dest_tile_sample_x)); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + // The only SSBO structure member. + id_vector_temp.push_back(builder.makeIntConstant(0)); + id_vector_temp.push_back(builder.createUnaryOp( + spv::OpBitcast, type_int, host_depth_offset)); + // StorageBuffer since SPIR-V 1.3, but since SPIR-V 1.0 is + // generated, it's Uniform. + host_depth32 = builder.createUnaryOp( + spv::OpBitcast, type_float, + builder.createLoad( + builder.createAccessChain(spv::StorageClassUniform, + host_depth_source_buffer, + id_vector_temp), + spv::NoPrecision)); + } + spv::Block* depth24_to_depth32_header = builder.getBuildPoint(); + spv::Id depth24_to_depth32_convert_id = spv::NoResult; + spv::Block* depth24_to_depth32_merge = nullptr; + spv::Id host_depth24 = spv::NoResult; + if (host_depth32 != spv::NoResult) { + // Convert the host depth value to the guest format and check if it + // matches the value in the currently owning guest render target. + switch (dest_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Round to the nearest even integer. This seems to be the + // correct, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. + id_vector_temp.clear(); + id_vector_temp.push_back(builder.createBinOp( + spv::OpFMul, type_float, host_depth32, + builder.makeFloatConstant(float(0xFFFFFF)))); + host_depth24 = builder.createUnaryOp( + spv::OpConvertFToU, type_uint, + builder.createBuiltinCall(type_float, ext_inst_glsl_std_450, + GLSLstd450RoundEven, + id_vector_temp)); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + host_depth24 = SpirvShaderTranslator::PreClampedDepthTo20e4( + builder, host_depth32, true, ext_inst_glsl_std_450); + } break; + } + assert_true(host_depth24 != spv::NoResult); + // Update the header block pointer after the conversion (to avoid + // assuming that the conversion doesn't branch). 
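The branch and phi being generated below implement, in scalar terms, roughly this selection: keep the full-precision host depth only if it still round-trips to the guest 24-bit value, otherwise convert the guest value. ToGuest24 and Guest24ToFloat32 stand for the format conversions in the surrounding code; the helper itself is illustrative:

#include <cstdint>

// Sketch of the depth selection: prefer the host float32 depth when it is
// still consistent with the guest 24-bit depth.
float SelectFragmentDepth(bool host_depth_loaded, float host_depth32,
                          uint32_t guest_depth24,
                          uint32_t (*ToGuest24)(float),
                          float (*Guest24ToFloat32)(uint32_t)) {
  if (host_depth_loaded && ToGuest24(host_depth32) == guest_depth24) {
    return host_depth32;  // up to date - keep the extra host precision
  }
  return Guest24ToFloat32(guest_depth24);
}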
+ depth24_to_depth32_header = builder.getBuildPoint(); + spv::Id host_depth_outdated = builder.createBinOp( + spv::OpINotEqual, type_bool, guest_depth24, host_depth24); + spv::Block& depth24_to_depth32_convert_entry = + builder.makeNewBlock(); + { + spv::Block& depth24_to_depth32_merge_block = + builder.makeNewBlock(); + depth24_to_depth32_merge = &depth24_to_depth32_merge_block; + } + { + std::unique_ptr depth24_to_depth32_merge_op = + std::make_unique(spv::OpSelectionMerge); + depth24_to_depth32_merge_op->addIdOperand( + depth24_to_depth32_merge->getId()); + depth24_to_depth32_merge_op->addImmediateOperand( + spv::SelectionControlMaskNone); + builder.getBuildPoint()->addInstruction( + std::move(depth24_to_depth32_merge_op)); + } + builder.createConditionalBranch(host_depth_outdated, + &depth24_to_depth32_convert_entry, + depth24_to_depth32_merge); + builder.setBuildPoint(&depth24_to_depth32_convert_entry); + } + // Convert the guest 24-bit depth to float32 (in an open conditional + // if the host depth is also loaded). + spv::Id guest_depth32 = spv::NoResult; + switch (dest_depth_format) { + case xenos::DepthRenderTargetFormat::kD24S8: { + // Multiplying by 1.0 / 0xFFFFFF produces an incorrect result (for + // 0xC00000, for instance - which is 2_10_10_10 clear to 0001) - + // rescale from 0...0xFFFFFF to 0...0x1000000 doing what true + // float division followed by multiplication does (on x86-64 MSVC + // with default SSE rounding) - values starting from 0x800000 + // become bigger by 1; then accurately bias the result's exponent. + guest_depth32 = builder.createBinOp( + spv::OpFMul, type_float, + builder.createUnaryOp( + spv::OpConvertUToF, type_float, + builder.createBinOp( + spv::OpIAdd, type_uint, guest_depth24, + builder.createBinOp(spv::OpShiftRightLogical, + type_uint, guest_depth24, + builder.makeUintConstant(23)))), + builder.makeFloatConstant(1.0f / float(1 << 24))); + } break; + case xenos::DepthRenderTargetFormat::kD24FS8: { + guest_depth32 = SpirvShaderTranslator::Depth20e4To32( + builder, guest_depth24, 0, true, false, + ext_inst_glsl_std_450); + } break; + } + assert_true(guest_depth32 != spv::NoResult); + spv::Id fragment_depth32 = guest_depth32; + if (host_depth32 != spv::NoResult) { + assert_not_null(depth24_to_depth32_merge); + spv::Id depth24_to_depth32_result_block_id = + builder.getBuildPoint()->getId(); + builder.createBranch(depth24_to_depth32_merge); + builder.setBuildPoint(depth24_to_depth32_merge); + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(guest_depth32); + id_vector_temp.push_back(depth24_to_depth32_result_block_id); + id_vector_temp.push_back(host_depth32); + id_vector_temp.push_back(depth24_to_depth32_header->getId()); + fragment_depth32 = + builder.createOp(spv::OpPhi, type_float, id_vector_temp); + } + builder.createStore(fragment_depth32, output_fragment_depth); + } + } break; + case TransferOutput::kStencilBit: { + if (packed) { + // Kill the sample if the needed stencil bit is not set. 
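In the stencil-bit output mode, the push-constant mask selects which guest stencil bit is being transferred, and fragments whose transferred stencil byte doesn't have that bit set are discarded with OpKill. In scalar terms, the survival test below is roughly:

#include <cstdint>

// Sketch: the fragment survives only if the stencil byte read from the source
// has the bit selected by the push-constant mask set.
bool StencilBitFragmentSurvives(uint32_t packed_stencil,
                                uint32_t stencil_bit_mask) {
  return (packed_stencil & stencil_bit_mask) != 0;
}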
+ assert_true(push_constants_member_stencil_mask != UINT32_MAX); + id_vector_temp.clear(); + id_vector_temp.push_back(builder.makeIntConstant( + int32_t(push_constants_member_stencil_mask))); + spv::Id stencil_mask_constant = builder.createLoad( + builder.createAccessChain(spv::StorageClassPushConstant, + push_constants, id_vector_temp), + spv::NoPrecision); + spv::Id stencil_sample_passed = builder.createBinOp( + spv::OpINotEqual, type_bool, + builder.createBinOp(spv::OpBitwiseAnd, type_uint, packed, + stencil_mask_constant), + builder.makeUintConstant(0)); + spv::Block& stencil_bit_kill_block = builder.makeNewBlock(); + spv::Block& stencil_bit_merge_block = builder.makeNewBlock(); + { + std::unique_ptr stencil_bit_merge_op = + std::make_unique(spv::OpSelectionMerge); + stencil_bit_merge_op->addIdOperand(stencil_bit_merge_block.getId()); + stencil_bit_merge_op->addImmediateOperand( + spv::SelectionControlMaskNone); + builder.getBuildPoint()->addInstruction( + std::move(stencil_bit_merge_op)); + } + builder.createConditionalBranch(stencil_sample_passed, + &stencil_bit_merge_block, + &stencil_bit_kill_block); + builder.setBuildPoint(&stencil_bit_kill_block); + builder.createNoResultOp(spv::OpKill); + builder.setBuildPoint(&stencil_bit_merge_block); + } + } break; + } + } + + // End the main function and make it the entry point. + builder.leaveFunction(); + builder.addExecutionMode(main_function, spv::ExecutionModeOriginUpperLeft); + if (output_fragment_depth != spv::NoResult) { + builder.addExecutionMode(main_function, spv::ExecutionModeDepthReplacing); + } + if (output_fragment_stencil_ref != spv::NoResult) { + builder.addExecutionMode(main_function, + spv::ExecutionModeStencilRefReplacingEXT); + } + spv::Instruction* entry_point = + builder.addEntryPoint(spv::ExecutionModelFragment, main_function, "main"); + for (spv::Id interface_id : main_interface) { + entry_point->addIdOperand(interface_id); + } + + // Serialize the shader code. + std::vector shader_code; + builder.dump(shader_code); + + // Create the shader module, and store the handle even if creation fails not + // to try to create it again later. + VkShaderModule shader_module = ui::vulkan::util::CreateShaderModule( + provider, reinterpret_cast(shader_code.data()), + sizeof(uint32_t) * shader_code.size()); + if (shader_module == VK_NULL_HANDLE) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer shader 0x{:08X}", + key.key); + } + transfer_shaders_.emplace(key, shader_module); + return shader_module; +} + +VkPipeline const* VulkanRenderTargetCache::GetTransferPipelines( + TransferPipelineKey key) { + auto pipeline_it = transfer_pipelines_.find(key); + if (pipeline_it != transfer_pipelines_.end()) { + return pipeline_it->second[0] != VK_NULL_HANDLE ? 
pipeline_it->second.data() + : nullptr; + } + + VkRenderPass render_pass = GetRenderPass(key.render_pass_key); + VkShaderModule fragment_shader_module = GetTransferShader(key.shader_key); + if (render_pass == VK_NULL_HANDLE || + fragment_shader_module == VK_NULL_HANDLE) { + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + + const TransferModeInfo& mode = kTransferModes[size_t(key.shader_key.mode)]; + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + + uint32_t dest_sample_count = uint32_t(1) + << uint32_t(key.shader_key.dest_msaa_samples); + bool dest_is_masked_sample = + dest_sample_count > 1 && !device_features.sampleRateShading; + + VkPipelineShaderStageCreateInfo shader_stages[2]; + shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[0].pNext = nullptr; + shader_stages[0].flags = 0; + shader_stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; + shader_stages[0].module = transfer_passthrough_vertex_shader_; + shader_stages[0].pName = "main"; + shader_stages[0].pSpecializationInfo = nullptr; + shader_stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shader_stages[1].pNext = nullptr; + shader_stages[1].flags = 0; + shader_stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; + shader_stages[1].module = fragment_shader_module; + shader_stages[1].pName = "main"; + shader_stages[1].pSpecializationInfo = nullptr; + VkSpecializationMapEntry sample_id_specialization_map_entry; + uint32_t sample_id_specialization_constant; + VkSpecializationInfo sample_id_specialization_info; + if (dest_is_masked_sample) { + sample_id_specialization_map_entry.constantID = 0; + sample_id_specialization_map_entry.offset = 0; + sample_id_specialization_map_entry.size = sizeof(uint32_t); + sample_id_specialization_constant = 0; + sample_id_specialization_info.mapEntryCount = 1; + sample_id_specialization_info.pMapEntries = + &sample_id_specialization_map_entry; + sample_id_specialization_info.dataSize = + sizeof(sample_id_specialization_constant); + sample_id_specialization_info.pData = &sample_id_specialization_constant; + shader_stages[1].pSpecializationInfo = &sample_id_specialization_info; + } + + VkVertexInputBindingDescription vertex_input_binding; + vertex_input_binding.binding = 0; + vertex_input_binding.stride = sizeof(float) * 2; + vertex_input_binding.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + VkVertexInputAttributeDescription vertex_input_attribute; + vertex_input_attribute.location = 0; + vertex_input_attribute.binding = 0; + vertex_input_attribute.format = VK_FORMAT_R32G32_SFLOAT; + vertex_input_attribute.offset = 0; + VkPipelineVertexInputStateCreateInfo vertex_input_state; + vertex_input_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertex_input_state.pNext = nullptr; + vertex_input_state.flags = 0; + vertex_input_state.vertexBindingDescriptionCount = 1; + vertex_input_state.pVertexBindingDescriptions = &vertex_input_binding; + vertex_input_state.vertexAttributeDescriptionCount = 1; + vertex_input_state.pVertexAttributeDescriptions = &vertex_input_attribute; + + VkPipelineInputAssemblyStateCreateInfo input_assembly_state; + input_assembly_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + input_assembly_state.pNext = nullptr; + 
input_assembly_state.flags = 0; + input_assembly_state.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + input_assembly_state.primitiveRestartEnable = VK_FALSE; + + // Dynamic, to stay within maxViewportDimensions while preferring a + // power-of-two factor for converting from pixel coordinates to NDC for exact + // precision. + VkPipelineViewportStateCreateInfo viewport_state; + viewport_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewport_state.pNext = nullptr; + viewport_state.flags = 0; + viewport_state.viewportCount = 1; + viewport_state.pViewports = nullptr; + viewport_state.scissorCount = 1; + viewport_state.pScissors = nullptr; + + VkPipelineRasterizationStateCreateInfo rasterization_state = {}; + rasterization_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterization_state.polygonMode = VK_POLYGON_MODE_FILL; + rasterization_state.cullMode = VK_CULL_MODE_NONE; + rasterization_state.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rasterization_state.lineWidth = 1.0f; + + // For samples other than the first, will be changed for the pipelines for + // other samples. + VkSampleMask sample_mask = UINT32_MAX; + VkPipelineMultisampleStateCreateInfo multisample_state = {}; + multisample_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisample_state.rasterizationSamples = + (dest_sample_count == 2 && !msaa_2x_attachments_supported_) + ? VK_SAMPLE_COUNT_4_BIT + : VkSampleCountFlagBits(dest_sample_count); + if (dest_sample_count > 1) { + if (device_features.sampleRateShading) { + multisample_state.sampleShadingEnable = VK_TRUE; + multisample_state.minSampleShading = 1.0f; + if (dest_sample_count == 2 && !msaa_2x_attachments_supported_) { + // Emulating 2x MSAA as samples 0 and 3 of 4x MSAA when 2x is not + // supported. + sample_mask = 0b1001; + } + } else { + sample_mask = 0b1; + } + if (sample_mask != UINT32_MAX) { + multisample_state.pSampleMask = &sample_mask; + } + } + + // Whether the depth / stencil state is used depends on the presence of a + // depth attachment in the render pass - but not making assumptions about + // whether the render pass contains any specific attachments, so setting up + // valid depth / stencil state unconditionally. + VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {}; + depth_stencil_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + if (mode.output == TransferOutput::kDepth) { + depth_stencil_state.depthTestEnable = VK_TRUE; + depth_stencil_state.depthWriteEnable = VK_TRUE; + depth_stencil_state.depthCompareOp = cvars::depth_transfer_not_equal_test + ? VK_COMPARE_OP_NOT_EQUAL + : VK_COMPARE_OP_ALWAYS; + } + if ((mode.output == TransferOutput::kDepth && + provider.device_extensions().ext_shader_stencil_export) || + mode.output == TransferOutput::kStencilBit) { + depth_stencil_state.stencilTestEnable = VK_TRUE; + depth_stencil_state.front.failOp = VK_STENCIL_OP_KEEP; + depth_stencil_state.front.passOp = VK_STENCIL_OP_REPLACE; + depth_stencil_state.front.depthFailOp = VK_STENCIL_OP_REPLACE; + // Using ALWAYS, not NOT_EQUAL, so depth writing is unaffected by stencil + // being different. + depth_stencil_state.front.compareOp = VK_COMPARE_OP_ALWAYS; + // Will be dynamic for stencil bit output. 
+ depth_stencil_state.front.writeMask = UINT8_MAX; + depth_stencil_state.front.reference = UINT8_MAX; + depth_stencil_state.back = depth_stencil_state.front; + } + + // Whether the color blend state is used depends on the presence of color + // attachments in the render pass - but not making assumptions about whether + // the render pass contains any specific attachments, so setting up valid + // color blend state unconditionally. + VkPipelineColorBlendAttachmentState + color_blend_attachments[xenos::kMaxColorRenderTargets] = {}; + VkPipelineColorBlendStateCreateInfo color_blend_state = {}; + color_blend_state.sType = + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + color_blend_state.attachmentCount = + 32 - xe::lzcnt(key.render_pass_key.depth_and_color_used >> 1); + color_blend_state.pAttachments = color_blend_attachments; + if (mode.output == TransferOutput::kColor) { + if (device_features.independentBlend) { + // State the intention more explicitly. + color_blend_attachments[key.shader_key.dest_color_rt_index] + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + } else { + // The blend state for all attachments must be identical, but other render + // targets are not written to by the shader. + for (uint32_t i = 0; i < color_blend_state.attachmentCount; ++i) { + color_blend_attachments[i].colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + } + } + } + + std::array dynamic_states; + VkPipelineDynamicStateCreateInfo dynamic_state; + dynamic_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamic_state.pNext = nullptr; + dynamic_state.flags = 0; + dynamic_state.dynamicStateCount = 0; + dynamic_state.pDynamicStates = dynamic_states.data(); + dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_VIEWPORT; + dynamic_states[dynamic_state.dynamicStateCount++] = VK_DYNAMIC_STATE_SCISSOR; + if (mode.output == TransferOutput::kStencilBit) { + dynamic_states[dynamic_state.dynamicStateCount++] = + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK; + } + + std::array pipelines{}; + VkGraphicsPipelineCreateInfo pipeline_create_info; + pipeline_create_info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipeline_create_info.pNext = nullptr; + pipeline_create_info.flags = 0; + if (dest_is_masked_sample) { + pipeline_create_info.flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; + } + pipeline_create_info.stageCount = uint32_t(xe::countof(shader_stages)); + pipeline_create_info.pStages = shader_stages; + pipeline_create_info.pVertexInputState = &vertex_input_state; + pipeline_create_info.pInputAssemblyState = &input_assembly_state; + pipeline_create_info.pTessellationState = nullptr; + pipeline_create_info.pViewportState = &viewport_state; + pipeline_create_info.pRasterizationState = &rasterization_state; + pipeline_create_info.pMultisampleState = &multisample_state; + pipeline_create_info.pDepthStencilState = &depth_stencil_state; + pipeline_create_info.pColorBlendState = &color_blend_state; + pipeline_create_info.pDynamicState = &dynamic_state; + pipeline_create_info.layout = + transfer_pipeline_layouts_[size_t(mode.pipeline_layout)]; + pipeline_create_info.renderPass = render_pass; + pipeline_create_info.subpass = 0; + pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; + pipeline_create_info.basePipelineIndex = -1; + if (dfn.vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, + 
&pipeline_create_info, nullptr, + &pipelines[0]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target ownership " + "transfer pipeline for render pass 0x{:08X}, shader 0x{:08X}", + key.render_pass_key.key, key.shader_key.key); + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + if (dest_is_masked_sample) { + assert_true(multisample_state.pSampleMask == &sample_mask); + pipeline_create_info.flags = (pipeline_create_info.flags & + ~VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT) | + VK_PIPELINE_CREATE_DERIVATIVE_BIT; + pipeline_create_info.basePipelineHandle = pipelines[0]; + for (uint32_t i = 1; i < dest_sample_count; ++i) { + // Emulating 2x MSAA as samples 0 and 3 of 4x MSAA when 2x is not + // supported. + uint32_t host_sample_index = + (dest_sample_count == 2 && !msaa_2x_attachments_supported_ && i == 1) + ? 3 + : i; + sample_id_specialization_constant = host_sample_index; + sample_mask = uint32_t(1) << host_sample_index; + if (dfn.vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, + &pipeline_create_info, nullptr, + &pipelines[i]) != VK_SUCCESS) { + XELOGE( + "VulkanRenderTargetCache: Failed to create the render target " + "ownership transfer pipeline for render pass 0x{:08X}, shader " + "0x{:08X}, sample {}", + key.render_pass_key.key, key.shader_key.key, i); + for (uint32_t j = 0; j < i; ++j) { + dfn.vkDestroyPipeline(device, pipelines[j], nullptr); + } + transfer_pipelines_.emplace(key, std::array{}); + return nullptr; + } + } + } + return transfer_pipelines_.emplace(key, pipelines).first->second.data(); +} + +void VulkanRenderTargetCache::PerformTransfersAndResolveClears( + uint32_t render_target_count, RenderTarget* const* render_targets, + const std::vector* render_target_transfers, + const uint64_t* render_target_resolve_clear_values, + const Transfer::Rectangle* resolve_clear_rectangle) { + assert_true(GetPath() == Path::kHostRenderTargets); + + const ui::vulkan::VulkanProvider& provider = + command_processor_.GetVulkanProvider(); + const VkPhysicalDeviceLimits& device_limits = + provider.device_properties().limits; + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); + bool shader_stencil_export = + provider.device_extensions().ext_shader_stencil_export; + uint64_t current_submission = command_processor_.GetCurrentSubmission(); + DeferredCommandBuffer& command_buffer = + command_processor_.deferred_command_buffer(); + + bool resolve_clear_needed = + render_target_resolve_clear_values && resolve_clear_rectangle; + VkClearRect resolve_clear_rect; + if (resolve_clear_needed) { + // Assuming the rectangle is already clamped by the setup function from the + // common render target cache. + resolve_clear_rect.rect.offset.x = + int32_t(resolve_clear_rectangle->x_pixels * resolution_scale_x_); + resolve_clear_rect.rect.offset.y = + int32_t(resolve_clear_rectangle->y_pixels * resolution_scale_y_); + resolve_clear_rect.rect.extent.width = + resolve_clear_rectangle->width_pixels * resolution_scale_x_; + resolve_clear_rect.rect.extent.height = + resolve_clear_rectangle->height_pixels * resolution_scale_y_; + resolve_clear_rect.baseArrayLayer = 0; + resolve_clear_rect.layerCount = 1; + } + + // Do host depth storing for the depth destination (assuming there can be only + // one depth destination) where depth destination == host depth source. 
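The host depth copy in the EDRAM buffer is laid out as whole tiles of raw samples, matching the indexing the transfer fragment shader builds above. A minimal CPU-side model of that store, under the assumption that the compute pass writes one 32-bit depth value per sample (all names here are hypothetical, not part of this change):

#include <cstdint>
#include <cstring>
#include <vector>

// CPU-side model of host depth storing: tiles are laid out linearly, samples
// row-major within a tile - the same addressing the transfer fragment shader
// above uses to read the value back.
void StoreHostDepthSample(std::vector<uint32_t>& host_depth_copy,
                          uint32_t tile_index, uint32_t tile_width_samples,
                          uint32_t tile_height_samples, uint32_t sample_x,
                          uint32_t sample_y, float host_depth32) {
  uint32_t offset = tile_index * tile_width_samples * tile_height_samples +
                    sample_y * tile_width_samples + sample_x;
  uint32_t bits;
  std::memcpy(&bits, &host_depth32, sizeof(bits));  // Keep exact float32 bits.
  host_depth_copy[offset] = bits;
}

Keeping the full float32 precision here is what lets depth survive round trips through the 24-bit guest formats: the loop below snapshots it before EDRAM range ownership changes, and the transfer shader above falls back to the stored value only while the guest 24-bit depth still matches its re-encoded form.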
+ bool host_depth_store_set_up = false; + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + auto& dest_vulkan_rt = *static_cast(dest_rt); + RenderTargetKey dest_rt_key = dest_vulkan_rt.key(); + if (!dest_rt_key.is_depth) { + continue; + } + const std::vector& depth_transfers = render_target_transfers[i]; + for (const Transfer& transfer : depth_transfers) { + if (transfer.host_depth_source != dest_rt) { + continue; + } + if (!host_depth_store_set_up) { + // Pipeline. + command_processor_.BindExternalComputePipeline( + host_depth_store_pipelines_[size_t(dest_rt_key.msaa_samples)]); + // Descriptor set bindings. + VkDescriptorSet host_depth_store_descriptor_sets[] = { + edram_storage_buffer_descriptor_set_, + dest_vulkan_rt.GetDescriptorSetTransferSource(), + }; + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_COMPUTE, host_depth_store_pipeline_layout_, + 0, uint32_t(xe::countof(host_depth_store_descriptor_sets)), + host_depth_store_descriptor_sets, 0, nullptr); + // Render target constant. + HostDepthStoreRenderTargetConstant + host_depth_store_render_target_constant = + GetHostDepthStoreRenderTargetConstant( + dest_rt_key.pitch_tiles_at_32bpp, + msaa_2x_attachments_supported_); + command_buffer.CmdVkPushConstants( + host_depth_store_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(HostDepthStoreConstants, render_target)), + sizeof(host_depth_store_render_target_constant), + &host_depth_store_render_target_constant); + // Barriers - don't need to try to combine them with the rest of + // render target transfer barriers now - if this happens, after host + // depth storing, SHADER_READ -> DEPTH_STENCIL_ATTACHMENT_WRITE will be + // done anyway even in the best case, so it's not possible to have all + // the barriers in one place here. + UseEdramBuffer(EdramBufferUsage::kComputeWrite); + // Always transitioning both depth and stencil, not storing separate + // usage flags for depth and stencil. 
+ command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + dest_vulkan_rt.current_stage_mask(), + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + dest_vulkan_rt.current_access_mask(), VK_ACCESS_SHADER_READ_BIT, + dest_vulkan_rt.current_layout(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + dest_vulkan_rt.SetUsage(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + host_depth_store_set_up = true; + } + Transfer::Rectangle + transfer_rectangles[Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_rectangle_count = transfer.GetRectangles( + dest_rt_key.base_tiles, dest_rt_key.pitch_tiles_at_32bpp, + dest_rt_key.msaa_samples, false, transfer_rectangles, + resolve_clear_rectangle); + assert_not_zero(transfer_rectangle_count); + HostDepthStoreRectangleConstant host_depth_store_rectangle_constant; + for (uint32_t j = 0; j < transfer_rectangle_count; ++j) { + uint32_t group_count_x, group_count_y; + GetHostDepthStoreRectangleInfo( + transfer_rectangles[j], dest_rt_key.msaa_samples, + host_depth_store_rectangle_constant, group_count_x, group_count_y); + command_buffer.CmdVkPushConstants( + host_depth_store_pipeline_layout_, VK_SHADER_STAGE_COMPUTE_BIT, + uint32_t(offsetof(HostDepthStoreConstants, rectangle)), + sizeof(host_depth_store_rectangle_constant), + &host_depth_store_rectangle_constant); + command_processor_.SubmitBarriers(true); + command_buffer.CmdVkDispatch(group_count_x, group_count_y, 1); + MarkEdramBufferModified(); + } + } + break; + } + + constexpr VkPipelineStageFlags kSourceStageMask = + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + constexpr VkAccessFlags kSourceAccessMask = VK_ACCESS_SHADER_READ_BIT; + constexpr VkImageLayout kSourceLayout = + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + // Try to insert as many barriers as possible in one place, hoping that in the + // best case (no cross-copying between current render targets), barriers will + // need to be only inserted here, not between transfers. In case of + // cross-copying, if the destination use is going to happen before the source + // use, choose the destination state, otherwise the source state - to match + // the order in which transfers will actually happen (otherwise there will be + // just a useless switch back and forth). + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + const std::vector& dest_transfers = render_target_transfers[i]; + if (!resolve_clear_needed && dest_transfers.empty()) { + continue; + } + // Transition the destination, only if not going to be used as a source + // earlier. + bool dest_used_previously_as_source = false; + for (uint32_t j = 0; j < i; ++j) { + for (const Transfer& previous_transfer : render_target_transfers[j]) { + if (previous_transfer.source == dest_rt || + previous_transfer.host_depth_source == dest_rt) { + dest_used_previously_as_source = true; + break; + } + } + } + if (!dest_used_previously_as_source) { + auto& dest_vulkan_rt = *static_cast(dest_rt); + VkPipelineStageFlags dest_dst_stage_mask; + VkAccessFlags dest_dst_access_mask; + VkImageLayout dest_new_layout; + dest_vulkan_rt.GetDrawUsage(&dest_dst_stage_mask, &dest_dst_access_mask, + &dest_new_layout); + command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + dest_vulkan_rt.key().is_depth + ? 
(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + dest_vulkan_rt.current_stage_mask(), dest_dst_stage_mask, + dest_vulkan_rt.current_access_mask(), dest_dst_access_mask, + dest_vulkan_rt.current_layout(), dest_new_layout); + dest_vulkan_rt.SetUsage(dest_dst_stage_mask, dest_dst_access_mask, + dest_new_layout); + } + // Transition the sources, only if not going to be used as destinations + // earlier. + for (const Transfer& transfer : dest_transfers) { + bool source_previously_used_as_dest = false; + bool host_depth_source_previously_used_as_dest = false; + for (uint32_t j = 0; j < i; ++j) { + if (render_target_transfers[j].empty()) { + continue; + } + const RenderTarget* previous_rt = render_targets[j]; + if (transfer.source == previous_rt) { + source_previously_used_as_dest = true; + } + if (transfer.host_depth_source == previous_rt) { + host_depth_source_previously_used_as_dest = true; + } + } + if (!source_previously_used_as_dest) { + auto& source_vulkan_rt = + *static_cast(transfer.source); + command_processor_.PushImageMemoryBarrier( + source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + source_vulkan_rt.key().is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + source_vulkan_rt.current_stage_mask(), kSourceStageMask, + source_vulkan_rt.current_access_mask(), kSourceAccessMask, + source_vulkan_rt.current_layout(), kSourceLayout); + source_vulkan_rt.SetUsage(kSourceStageMask, kSourceAccessMask, + kSourceLayout); + } + // transfer.host_depth_source == dest_rt means the EDRAM buffer will be + // used instead, no need to transition. + if (transfer.host_depth_source && transfer.host_depth_source != dest_rt && + !host_depth_source_previously_used_as_dest) { + auto& host_depth_source_vulkan_rt = + *static_cast(transfer.host_depth_source); + command_processor_.PushImageMemoryBarrier( + host_depth_source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + host_depth_source_vulkan_rt.current_stage_mask(), kSourceStageMask, + host_depth_source_vulkan_rt.current_access_mask(), + kSourceAccessMask, host_depth_source_vulkan_rt.current_layout(), + kSourceLayout); + host_depth_source_vulkan_rt.SetUsage(kSourceStageMask, + kSourceAccessMask, kSourceLayout); + } + } + } + if (host_depth_store_set_up) { + // Will be reading copied host depth from the EDRAM buffer. + UseEdramBuffer(EdramBufferUsage::kFragmentRead); + } + + // Perform the transfers and clears. 
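The transfer loop below avoids redundant rebinds by remembering the last-bound pipeline layout, descriptor sets and push constants, and it locates each binding inside a layout by counting the used slots ordered before it. An illustrative sketch of that index computation, with std::popcount standing in for xe::bit_count and a hypothetical slot combination:

#include <bit>
#include <cstdint>

// Index of a descriptor set (or push constant dword) within a transfer
// pipeline layout that only contains the slots whose bits are set in
// used_bits: count the used slots ordered before the one being bound.
constexpr uint32_t BindingIndexInLayout(uint32_t used_bits,
                                        uint32_t binding_bit) {
  return uint32_t(std::popcount(used_bits & (binding_bit - 1)));
}
// For instance, in a layout using only slots 0 and 2 (a hypothetical
// combination), slot 2 is bound at index 1:
static_assert(BindingIndexInLayout(0b101, 0b100) == 1);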
+ + TransferPipelineLayoutIndex last_transfer_pipeline_layout_index = + TransferPipelineLayoutIndex::kCount; + uint32_t transfer_descriptor_sets_bound = 0; + uint32_t transfer_push_constants_set = 0; + VkDescriptorSet last_descriptor_set_host_depth_stencil_textures = + VK_NULL_HANDLE; + VkDescriptorSet last_descriptor_set_depth_stencil_textures = VK_NULL_HANDLE; + VkDescriptorSet last_descriptor_set_color_texture = VK_NULL_HANDLE; + TransferAddressConstant last_host_depth_address_constant; + TransferAddressConstant last_address_constant; + + for (uint32_t i = 0; i < render_target_count; ++i) { + RenderTarget* dest_rt = render_targets[i]; + if (!dest_rt) { + continue; + } + + const std::vector& current_transfers = render_target_transfers[i]; + if (current_transfers.empty() && !resolve_clear_needed) { + continue; + } + + auto& dest_vulkan_rt = *static_cast(dest_rt); + RenderTargetKey dest_rt_key = dest_vulkan_rt.key(); + + // Late barriers in case there was cross-copying that prevented merging of + // barriers. + { + VkPipelineStageFlags dest_dst_stage_mask; + VkAccessFlags dest_dst_access_mask; + VkImageLayout dest_new_layout; + dest_vulkan_rt.GetDrawUsage(&dest_dst_stage_mask, &dest_dst_access_mask, + &dest_new_layout); + command_processor_.PushImageMemoryBarrier( + dest_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + dest_rt_key.is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + dest_vulkan_rt.current_stage_mask(), dest_dst_stage_mask, + dest_vulkan_rt.current_access_mask(), dest_dst_access_mask, + dest_vulkan_rt.current_layout(), dest_new_layout); + dest_vulkan_rt.SetUsage(dest_dst_stage_mask, dest_dst_access_mask, + dest_new_layout); + } + + // Get the objects needed for transfers to the destination. + // TODO(Triang3l): Reuse the guest render pass for transfers where possible + // (if the Vulkan format used for drawing is also usable for transfers - for + // instance, R8G8B8A8_UNORM can be used for both, so the guest pass can be + // reused, but R16G16B16A16_SFLOAT render targets use R16G16B16A16_UINT for + // transfers, so the transfer pass has to be separate) to avoid stores and + // loads on tile-based devices to make this actually applicable. Also + // overall perform all non-cross-copying transfers for the current + // framebuffer configuration in a single pass, to load / store only once. + RenderPassKey transfer_render_pass_key; + transfer_render_pass_key.msaa_samples = dest_rt_key.msaa_samples; + if (dest_rt_key.is_depth) { + transfer_render_pass_key.depth_and_color_used = 0b1; + transfer_render_pass_key.depth_format = dest_rt_key.GetDepthFormat(); + } else { + transfer_render_pass_key.depth_and_color_used = 0b1 << 1; + transfer_render_pass_key.color_0_view_format = + dest_rt_key.GetColorFormat(); + transfer_render_pass_key.color_rts_use_transfer_formats = 1; + } + VkRenderPass transfer_render_pass = GetRenderPass(transfer_render_pass_key); + if (transfer_render_pass == VK_NULL_HANDLE) { + continue; + } + const RenderTarget* + transfer_framebuffer_render_targets[1 + xenos::kMaxColorRenderTargets] = + {}; + transfer_framebuffer_render_targets[dest_rt_key.is_depth ? 0 : 1] = dest_rt; + const Framebuffer* transfer_framebuffer = GetFramebuffer( + transfer_render_pass_key, dest_rt_key.pitch_tiles_at_32bpp, + transfer_framebuffer_render_targets); + if (!transfer_framebuffer) { + continue; + } + // Don't enter the render pass immediately - may still insert source + // barriers later. 
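When the destination is a depth render target and VK_EXT_shader_stencil_export is unavailable, the block below also queues one extra invocation per transfer for the stencil plane: the destination stencil is cleared to zero over the transfer rectangles, then each bit is written by its own masked draw whose fragment shader kills samples where the source bit is clear. A rough model of that per-bit sequence (command recording omitted; this is not the actual call order in this change):

// After the stencil clear to 0, one draw per bit: the pipeline replaces
// stencil with the 0xFF reference, the dynamic write mask limits the write to
// a single bit, and the kill in the fragment shader leaves unset bits at 0.
for (uint32_t bit = 0; bit < 8; ++bit) {
  const uint32_t stencil_write_mask = uint32_t(1) << bit;
  // 1. Push stencil_write_mask as the stencil mask push constant read by the
  //    transfer fragment shader (it discards samples where the bit is 0).
  // 2. vkCmdSetStencilWriteMask(cmd, VK_STENCIL_FACE_FRONT_AND_BACK,
  //    stencil_write_mask).
  // 3. Draw the transfer rectangles.
  (void)stencil_write_mask;
}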
+ + if (!current_transfers.empty()) { + uint32_t dest_pitch_tiles = dest_rt_key.GetPitchTiles(); + bool dest_is_64bpp = dest_rt_key.Is64bpp(); + + // Gather shader keys and sort to reduce pipeline state and binding + // switches. Also gather stencil rectangles to clear if needed. + bool need_stencil_bit_draws = + dest_rt_key.is_depth && !shader_stencil_export; + current_transfer_invocations_.clear(); + current_transfer_invocations_.reserve( + current_transfers.size() << uint32_t(need_stencil_bit_draws)); + uint32_t rt_sort_index = 0; + TransferShaderKey new_transfer_shader_key; + new_transfer_shader_key.dest_msaa_samples = dest_rt_key.msaa_samples; + new_transfer_shader_key.dest_resource_format = + dest_rt_key.resource_format; + uint32_t stencil_clear_rectangle_count = 0; + for (uint32_t j = 0; j <= uint32_t(need_stencil_bit_draws); ++j) { + // j == 0 - color or depth. + // j == 1 - stencil bits. + // Stencil bit writing always requires a different root signature, + // handle these separately. Stencil never has a host depth source. + // Clear previously set sort indices. + for (const Transfer& transfer : current_transfers) { + auto host_depth_source_vulkan_rt = + static_cast(transfer.host_depth_source); + if (host_depth_source_vulkan_rt) { + host_depth_source_vulkan_rt->SetTemporarySortIndex(UINT32_MAX); + } + assert_not_null(transfer.source); + auto& source_vulkan_rt = + *static_cast(transfer.source); + source_vulkan_rt.SetTemporarySortIndex(UINT32_MAX); + } + for (const Transfer& transfer : current_transfers) { + assert_not_null(transfer.source); + auto& source_vulkan_rt = + *static_cast(transfer.source); + VulkanRenderTarget* host_depth_source_vulkan_rt = + j ? nullptr + : static_cast(transfer.host_depth_source); + if (host_depth_source_vulkan_rt && + host_depth_source_vulkan_rt->temporary_sort_index() == + UINT32_MAX) { + host_depth_source_vulkan_rt->SetTemporarySortIndex(rt_sort_index++); + } + if (source_vulkan_rt.temporary_sort_index() == UINT32_MAX) { + source_vulkan_rt.SetTemporarySortIndex(rt_sort_index++); + } + RenderTargetKey source_rt_key = source_vulkan_rt.key(); + new_transfer_shader_key.source_msaa_samples = + source_rt_key.msaa_samples; + new_transfer_shader_key.source_resource_format = + source_rt_key.resource_format; + bool host_depth_source_is_copy = + host_depth_source_vulkan_rt == &dest_vulkan_rt; + // The host depth copy buffer has only raw samples. + new_transfer_shader_key.host_depth_source_msaa_samples = + (host_depth_source_vulkan_rt && !host_depth_source_is_copy) + ? host_depth_source_vulkan_rt->key().msaa_samples + : xenos::MsaaSamples::k1X; + if (j) { + new_transfer_shader_key.mode = + source_rt_key.is_depth ? TransferMode::kDepthToStencilBit + : TransferMode::kColorToStencilBit; + stencil_clear_rectangle_count += + transfer.GetRectangles(dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + nullptr, resolve_clear_rectangle); + } else { + if (dest_rt_key.is_depth) { + if (host_depth_source_vulkan_rt) { + if (host_depth_source_is_copy) { + new_transfer_shader_key.mode = + source_rt_key.is_depth + ? TransferMode::kDepthAndHostDepthCopyToDepth + : TransferMode::kColorAndHostDepthCopyToDepth; + } else { + new_transfer_shader_key.mode = + source_rt_key.is_depth + ? TransferMode::kDepthAndHostDepthToDepth + : TransferMode::kColorAndHostDepthToDepth; + } + } else { + new_transfer_shader_key.mode = + source_rt_key.is_depth ? 
TransferMode::kDepthToDepth + : TransferMode::kColorToDepth; + } + } else { + new_transfer_shader_key.mode = source_rt_key.is_depth + ? TransferMode::kDepthToColor + : TransferMode::kColorToColor; + } + } + current_transfer_invocations_.emplace_back(transfer, + new_transfer_shader_key); + if (j) { + current_transfer_invocations_.back().transfer.host_depth_source = + nullptr; + } + } + } + std::sort(current_transfer_invocations_.begin(), + current_transfer_invocations_.end()); + + for (auto it = current_transfer_invocations_.cbegin(); + it != current_transfer_invocations_.cend(); ++it) { + assert_not_null(it->transfer.source); + auto& source_vulkan_rt = + *static_cast(it->transfer.source); + command_processor_.PushImageMemoryBarrier( + source_vulkan_rt.image(), + ui::vulkan::util::InitializeSubresourceRange( + source_vulkan_rt.key().is_depth + ? (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) + : VK_IMAGE_ASPECT_COLOR_BIT), + source_vulkan_rt.current_stage_mask(), kSourceStageMask, + source_vulkan_rt.current_access_mask(), kSourceAccessMask, + source_vulkan_rt.current_layout(), kSourceLayout); + source_vulkan_rt.SetUsage(kSourceStageMask, kSourceAccessMask, + kSourceLayout); + auto host_depth_source_vulkan_rt = + static_cast(it->transfer.host_depth_source); + if (host_depth_source_vulkan_rt) { + TransferShaderKey transfer_shader_key = it->shader_key; + if (transfer_shader_key.mode == + TransferMode::kDepthAndHostDepthCopyToDepth || + transfer_shader_key.mode == + TransferMode::kColorAndHostDepthCopyToDepth) { + // Reading copied host depth from the EDRAM buffer. + UseEdramBuffer(EdramBufferUsage::kFragmentRead); + } else { + // Reading host depth from the texture. + command_processor_.PushImageMemoryBarrier( + host_depth_source_vulkan_rt->image(), + ui::vulkan::util::InitializeSubresourceRange( + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT), + host_depth_source_vulkan_rt->current_stage_mask(), + kSourceStageMask, + host_depth_source_vulkan_rt->current_access_mask(), + kSourceAccessMask, + host_depth_source_vulkan_rt->current_layout(), kSourceLayout); + host_depth_source_vulkan_rt->SetUsage( + kSourceStageMask, kSourceAccessMask, kSourceLayout); + } + } + } + + // Perform the transfers for the render target. 
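The viewport configured just below is sized to the next power of two of the framebuffer extent so that the pixel-to-NDC factor is itself an exact power of two and integer pixel coordinates convert without rounding error. A worked example, assuming an illustrative framebuffer 1280 host pixels wide (next_pow2(1280) = 2048):

constexpr float kViewportWidth = 2048.0f;
constexpr float kPixelsToNdcX = 2.0f / kViewportWidth;          // exactly 2^-10
constexpr float kLeftEdgeNdc = -1.0f + 640.0f * kPixelsToNdcX;  // exactly -0.375
static_assert(kPixelsToNdcX == 0.0009765625f);
static_assert(kLeftEdgeNdc == -0.375f);

Every integer pixel coordinate up to the viewport width is a multiple of 2^-10 after scaling and needs at most 11 significand bits after the -1 bias, so transfer rectangle corners stay sample-exact in NDC.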
+ + command_processor_.SubmitBarriersAndEnterRenderTargetCacheRenderPass( + transfer_render_pass, transfer_framebuffer); + + if (stencil_clear_rectangle_count) { + VkClearAttachment* stencil_clear_attachment; + VkClearRect* stencil_clear_rect_write_ptr; + command_buffer.CmdClearAttachmentsEmplace(1, stencil_clear_attachment, + stencil_clear_rectangle_count, + stencil_clear_rect_write_ptr); + stencil_clear_attachment->aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; + stencil_clear_attachment->colorAttachment = 0; + stencil_clear_attachment->clearValue.depthStencil.depth = 0.0f; + stencil_clear_attachment->clearValue.depthStencil.stencil = 0; + for (const Transfer& transfer : current_transfers) { + Transfer::Rectangle transfer_stencil_clear_rectangles + [Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_stencil_clear_rectangle_count = + transfer.GetRectangles(dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + transfer_stencil_clear_rectangles, + resolve_clear_rectangle); + for (uint32_t j = 0; j < transfer_stencil_clear_rectangle_count; + ++j) { + const Transfer::Rectangle& stencil_clear_rectangle = + transfer_stencil_clear_rectangles[j]; + stencil_clear_rect_write_ptr->rect.offset.x = + int32_t(stencil_clear_rectangle.x_pixels * resolution_scale_x_); + stencil_clear_rect_write_ptr->rect.offset.y = + int32_t(stencil_clear_rectangle.y_pixels * resolution_scale_y_); + stencil_clear_rect_write_ptr->rect.extent.width = + stencil_clear_rectangle.width_pixels * resolution_scale_x_; + stencil_clear_rect_write_ptr->rect.extent.height = + stencil_clear_rectangle.height_pixels * resolution_scale_y_; + stencil_clear_rect_write_ptr->baseArrayLayer = 0; + stencil_clear_rect_write_ptr->layerCount = 1; + ++stencil_clear_rect_write_ptr; + } + } + } + + // Prefer power of two viewports for exact division by simply biasing the + // exponent. + VkViewport transfer_viewport; + transfer_viewport.x = 0.0f; + transfer_viewport.y = 0.0f; + transfer_viewport.width = + float(std::min(xe::next_pow2(transfer_framebuffer->host_extent.width), + device_limits.maxViewportDimensions[0])); + transfer_viewport.height = float( + std::min(xe::next_pow2(transfer_framebuffer->host_extent.height), + device_limits.maxViewportDimensions[1])); + transfer_viewport.minDepth = 0.0f; + transfer_viewport.maxDepth = 1.0f; + command_processor_.SetViewport(transfer_viewport); + float pixels_to_ndc_x = 2.0f / transfer_viewport.width; + float pixels_to_ndc_y = 2.0f / transfer_viewport.height; + VkRect2D transfer_scissor; + transfer_scissor.offset.x = 0; + transfer_scissor.offset.y = 0; + transfer_scissor.extent = transfer_framebuffer->host_extent; + command_processor_.SetScissor(transfer_scissor); + + for (auto it = current_transfer_invocations_.cbegin(); + it != current_transfer_invocations_.cend(); ++it) { + const TransferInvocation& transfer_invocation_first = *it; + // Will be merging transfers from the same source into one mesh. 
+ auto it_merged_first = it, it_merged_last = it; + uint32_t transfer_rectangle_count = + transfer_invocation_first.transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, nullptr, + resolve_clear_rectangle); + for (auto it_merge = std::next(it_merged_first); + it_merge != current_transfer_invocations_.cend(); ++it_merge) { + if (!transfer_invocation_first.CanBeMergedIntoOneDraw(*it_merge)) { + break; + } + transfer_rectangle_count += it_merge->transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, nullptr, + resolve_clear_rectangle); + it_merged_last = it_merge; + } + assert_not_zero(transfer_rectangle_count); + // Skip the merged transfers in the subsequent iterations. + it = it_merged_last; + + assert_not_null(it->transfer.source); + auto& source_vulkan_rt = + *static_cast(it->transfer.source); + auto host_depth_source_vulkan_rt = + static_cast(it->transfer.host_depth_source); + TransferShaderKey transfer_shader_key = it->shader_key; + const TransferModeInfo& transfer_mode_info = + kTransferModes[size_t(transfer_shader_key.mode)]; + TransferPipelineLayoutIndex transfer_pipeline_layout_index = + transfer_mode_info.pipeline_layout; + const TransferPipelineLayoutInfo& transfer_pipeline_layout_info = + kTransferPipelineLayoutInfos[size_t( + transfer_pipeline_layout_index)]; + uint32_t transfer_sample_pipeline_count = + device_features.sampleRateShading + ? 1 + : uint32_t(1) << uint32_t(dest_rt_key.msaa_samples); + bool transfer_is_stencil_bit = + (transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordStencilMaskBit) != 0; + + uint32_t transfer_vertex_count = 6 * transfer_rectangle_count; + VkBuffer transfer_vertex_buffer; + VkDeviceSize transfer_vertex_buffer_offset; + float* transfer_rectangle_write_ptr = + reinterpret_cast(transfer_vertex_buffer_pool_->Request( + current_submission, sizeof(float) * 2 * transfer_vertex_count, + sizeof(float), transfer_vertex_buffer, + transfer_vertex_buffer_offset)); + if (!transfer_rectangle_write_ptr) { + continue; + } + for (auto it_merged = it_merged_first; it_merged <= it_merged_last; + ++it_merged) { + Transfer::Rectangle transfer_invocation_rectangles + [Transfer::kMaxRectanglesWithCutout]; + uint32_t transfer_invocation_rectangle_count = + it_merged->transfer.GetRectangles( + dest_rt_key.base_tiles, dest_pitch_tiles, + dest_rt_key.msaa_samples, dest_is_64bpp, + transfer_invocation_rectangles, resolve_clear_rectangle); + assert_not_zero(transfer_invocation_rectangle_count); + for (uint32_t j = 0; j < transfer_invocation_rectangle_count; ++j) { + const Transfer::Rectangle& transfer_rectangle = + transfer_invocation_rectangles[j]; + float transfer_rectangle_x0 = + -1.0f + transfer_rectangle.x_pixels * pixels_to_ndc_x; + float transfer_rectangle_y0 = + -1.0f + transfer_rectangle.y_pixels * pixels_to_ndc_y; + float transfer_rectangle_x1 = + transfer_rectangle_x0 + + transfer_rectangle.width_pixels * pixels_to_ndc_x; + float transfer_rectangle_y1 = + transfer_rectangle_y0 + + transfer_rectangle.height_pixels * pixels_to_ndc_y; + // O-* + // |/ + // * + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // *-* + // |/ + // O + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + // *-O + // |/ + // * + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + 
*(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // O + // /| + // *-* + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y0; + // * + // /| + // O-* + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x0; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + // * + // /| + // *-O + *(transfer_rectangle_write_ptr++) = transfer_rectangle_x1; + *(transfer_rectangle_write_ptr++) = transfer_rectangle_y1; + } + } + command_buffer.CmdVkBindVertexBuffers(0, 1, &transfer_vertex_buffer, + &transfer_vertex_buffer_offset); + + const VkPipeline* transfer_pipelines = GetTransferPipelines( + TransferPipelineKey(transfer_render_pass_key, transfer_shader_key)); + if (!transfer_pipelines) { + continue; + } + command_processor_.BindExternalGraphicsPipeline(transfer_pipelines[0]); + if (last_transfer_pipeline_layout_index != + transfer_pipeline_layout_index) { + last_transfer_pipeline_layout_index = transfer_pipeline_layout_index; + transfer_descriptor_sets_bound = 0; + transfer_push_constants_set = 0; + } + + // Invalidate outdated bindings. + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + assert_not_null(host_depth_source_vulkan_rt); + VkDescriptorSet descriptor_set_host_depth_stencil_textures = + host_depth_source_vulkan_rt->GetDescriptorSetTransferSource(); + if (last_descriptor_set_host_depth_stencil_textures != + descriptor_set_host_depth_stencil_textures) { + last_descriptor_set_host_depth_stencil_textures = + descriptor_set_host_depth_stencil_textures; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetHostDepthStencilTexturesBit; + } + } + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + VkDescriptorSet descriptor_set_depth_stencil_textures = + source_vulkan_rt.GetDescriptorSetTransferSource(); + if (last_descriptor_set_depth_stencil_textures != + descriptor_set_depth_stencil_textures) { + last_descriptor_set_depth_stencil_textures = + descriptor_set_depth_stencil_textures; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetDepthStencilTexturesBit; + } + } + if (transfer_pipeline_layout_info.used_descriptor_sets & + kTransferUsedDescriptorSetColorTextureBit) { + VkDescriptorSet descriptor_set_color_texture = + source_vulkan_rt.GetDescriptorSetTransferSource(); + if (last_descriptor_set_color_texture != + descriptor_set_color_texture) { + last_descriptor_set_color_texture = descriptor_set_color_texture; + transfer_descriptor_sets_bound &= + ~kTransferUsedDescriptorSetColorTextureBit; + } + } + if (transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + assert_not_null(host_depth_source_vulkan_rt); + RenderTargetKey host_depth_source_rt_key = + host_depth_source_vulkan_rt->key(); + TransferAddressConstant host_depth_address_constant; + host_depth_address_constant.dest_pitch = dest_pitch_tiles; + host_depth_address_constant.source_pitch = + host_depth_source_rt_key.GetPitchTiles(); + host_depth_address_constant.source_to_dest = + int32_t(dest_rt_key.base_tiles) - + int32_t(host_depth_source_rt_key.base_tiles); + if (last_host_depth_address_constant != host_depth_address_constant) { + last_host_depth_address_constant = host_depth_address_constant; + transfer_push_constants_set &= + ~kTransferUsedPushConstantDwordHostDepthAddressBit; + } + } + if 
(transfer_pipeline_layout_info.used_push_constant_dwords & + kTransferUsedPushConstantDwordAddressBit) { + RenderTargetKey source_rt_key = source_vulkan_rt.key(); + TransferAddressConstant address_constant; + address_constant.dest_pitch = dest_pitch_tiles; + address_constant.source_pitch = source_rt_key.GetPitchTiles(); + address_constant.source_to_dest = int32_t(dest_rt_key.base_tiles) - + int32_t(source_rt_key.base_tiles); + if (last_address_constant != address_constant) { + last_address_constant = address_constant; + transfer_push_constants_set &= + ~kTransferUsedPushConstantDwordAddressBit; + } + } + + // Apply the new bindings. + // TODO(Triang3l): Merge binding updates into spans. + VkPipelineLayout transfer_pipeline_layout = + transfer_pipeline_layouts_[size_t(transfer_pipeline_layout_index)]; + uint32_t transfer_descriptor_sets_unbound = + transfer_pipeline_layout_info.used_descriptor_sets & + ~transfer_descriptor_sets_bound; + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetHostDepthBufferBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count(transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthBufferBit - 1)), + 1, &edram_storage_buffer_descriptor_set_, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetHostDepthBufferBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetHostDepthStencilTexturesBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count( + transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetHostDepthStencilTexturesBit - 1)), + 1, &last_descriptor_set_host_depth_stencil_textures, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetHostDepthStencilTexturesBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetDepthStencilTexturesBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count( + transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetDepthStencilTexturesBit - 1)), + 1, &last_descriptor_set_depth_stencil_textures, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetDepthStencilTexturesBit; + } + if (transfer_descriptor_sets_unbound & + kTransferUsedDescriptorSetColorTextureBit) { + command_buffer.CmdVkBindDescriptorSets( + VK_PIPELINE_BIND_POINT_GRAPHICS, transfer_pipeline_layout, + xe::bit_count(transfer_pipeline_layout_info.used_descriptor_sets & + (kTransferUsedDescriptorSetColorTextureBit - 1)), + 1, &last_descriptor_set_color_texture, 0, nullptr); + transfer_descriptor_sets_bound |= + kTransferUsedDescriptorSetColorTextureBit; + } + uint32_t transfer_push_constants_unset = + transfer_pipeline_layout_info.used_push_constant_dwords & + ~transfer_push_constants_set; + if (transfer_push_constants_unset & + kTransferUsedPushConstantDwordHostDepthAddressBit) { + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordHostDepthAddressBit - 1)), + sizeof(uint32_t), &last_host_depth_address_constant); + transfer_push_constants_set |= + kTransferUsedPushConstantDwordHostDepthAddressBit; + } + if (transfer_push_constants_unset & + 
kTransferUsedPushConstantDwordAddressBit) { + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info.used_push_constant_dwords & + (kTransferUsedPushConstantDwordAddressBit - 1)), + sizeof(uint32_t), &last_address_constant); + transfer_push_constants_set |= + kTransferUsedPushConstantDwordAddressBit; + } + + for (uint32_t j = 0; j < transfer_sample_pipeline_count; ++j) { + if (j) { + command_processor_.BindExternalGraphicsPipeline( + transfer_pipelines[j]); + } + for (uint32_t k = 0; k < uint32_t(transfer_is_stencil_bit ? 8 : 1); + ++k) { + if (transfer_is_stencil_bit) { + uint32_t transfer_stencil_bit = uint32_t(1) << k; + command_buffer.CmdVkPushConstants( + transfer_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, + sizeof(uint32_t) * + xe::bit_count( + transfer_pipeline_layout_info + .used_push_constant_dwords & + (kTransferUsedPushConstantDwordStencilMaskBit - 1)), + sizeof(uint32_t), &transfer_stencil_bit); + command_buffer.CmdVkSetStencilWriteMask( + VK_STENCIL_FACE_FRONT_AND_BACK, transfer_stencil_bit); + } + command_buffer.CmdVkDraw(transfer_vertex_count, 1, 0, 0); + } + } + } + } + + // Perform the clear. + if (resolve_clear_needed) { + command_processor_.SubmitBarriersAndEnterRenderTargetCacheRenderPass( + transfer_render_pass, transfer_framebuffer); + VkClearAttachment resolve_clear_attachment; + resolve_clear_attachment.colorAttachment = 0; + std::memset(&resolve_clear_attachment.clearValue, 0, + sizeof(resolve_clear_attachment.clearValue)); + uint64_t clear_value = render_target_resolve_clear_values[i]; + if (dest_rt_key.is_depth) { + resolve_clear_attachment.aspectMask = + VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + uint32_t depth_guest_clear_value = + (uint32_t(clear_value) >> 8) & 0xFFFFFF; + switch (dest_rt_key.GetDepthFormat()) { + case xenos::DepthRenderTargetFormat::kD24S8: + resolve_clear_attachment.clearValue.depthStencil.depth = + xenos::UNorm24To32(depth_guest_clear_value); + break; + case xenos::DepthRenderTargetFormat::kD24FS8: + // Taking [0, 2) -> [0, 1) remapping into account. 
+ resolve_clear_attachment.clearValue.depthStencil.depth = + xenos::Float20e4To32(depth_guest_clear_value) * 0.5f; + break; + } + resolve_clear_attachment.clearValue.depthStencil.stencil = + uint32_t(clear_value) & 0xFF; + } else { + resolve_clear_attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + switch (dest_rt_key.GetColorFormat()) { + case xenos::ColorRenderTargetFormat::k_8_8_8_8: + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: { + for (uint32_t j = 0; j < 4; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + ((clear_value >> (j * 8)) & 0xFF) * (1.0f / 0xFF); + } + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10: + case xenos::ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: { + for (uint32_t j = 0; j < 3; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + ((clear_value >> (j * 10)) & 0x3FF) * (1.0f / 0x3FF); + } + resolve_clear_attachment.clearValue.color.float32[3] = + ((clear_value >> 30) & 0x3) * (1.0f / 0x3); + } break; + case xenos::ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case xenos::ColorRenderTargetFormat:: + k_2_10_10_10_FLOAT_AS_16_16_16_16: { + for (uint32_t j = 0; j < 3; ++j) { + resolve_clear_attachment.clearValue.color.float32[j] = + xenos::Float7e3To32((clear_value >> (j * 10)) & 0x3FF); + } + resolve_clear_attachment.clearValue.color.float32[3] = + ((clear_value >> 30) & 0x3) * (1.0f / 0x3); + } break; + case xenos::ColorRenderTargetFormat::k_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_FLOAT: { + // Using uint for transfers and clears of both. Disregarding the + // current -32...32 vs. -1...1 settings for consistency with color + // clear via depth aliasing. + // TODO(Triang3l): Handle cases of unsupported multisampled 16_UINT + // and completely unsupported 16_UNORM. + for (uint32_t j = 0; j < 2; ++j) { + resolve_clear_attachment.clearValue.color.uint32[j] = + uint32_t(clear_value >> (j * 16)) & 0xFFFF; + } + } break; + case xenos::ColorRenderTargetFormat::k_16_16_16_16: + case xenos::ColorRenderTargetFormat::k_16_16_16_16_FLOAT: { + // Using uint for transfers and clears of both. Disregarding the + // current -32...32 vs. -1...1 settings for consistency with color + // clear via depth aliasing. + // TODO(Triang3l): Handle cases of unsupported multisampled 16_UINT + // and completely unsupported 16_UNORM. + for (uint32_t j = 0; j < 4; ++j) { + resolve_clear_attachment.clearValue.color.uint32[j] = + uint32_t(clear_value >> (j * 16)) & 0xFFFF; + } + } break; + case xenos::ColorRenderTargetFormat::k_32_FLOAT: { + // Using uint for proper denormal and NaN handling. + resolve_clear_attachment.clearValue.color.uint32[0] = + uint32_t(clear_value); + } break; + case xenos::ColorRenderTargetFormat::k_32_32_FLOAT: { + // Using uint for proper denormal and NaN handling. 
+ resolve_clear_attachment.clearValue.color.uint32[0] = + uint32_t(clear_value); + resolve_clear_attachment.clearValue.color.uint32[1] = + uint32_t(clear_value >> 32); + } break; + } + } + command_buffer.CmdVkClearAttachments(1, &resolve_clear_attachment, 1, + &resolve_clear_rect); + } + } +} + } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h index 97bb690af..c98da4974 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h @@ -10,13 +10,20 @@ #ifndef XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_ #define XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_ +#include #include #include +#include +#include #include #include "xenia/base/hash.h" +#include "xenia/base/xxhash.h" #include "xenia/gpu/render_target_cache.h" +#include "xenia/gpu/xenos.h" +#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h" #include "xenia/ui/vulkan/vulkan_provider.h" +#include "xenia/ui/vulkan/vulkan_upload_buffer_pool.h" namespace xe { namespace gpu { @@ -28,8 +35,12 @@ class VulkanRenderTargetCache final : public RenderTargetCache { public: union RenderPassKey { struct { - // If emulating 2x as 4x, set this to 4x for 2x not to create unnecessary - // render pass objects. + // If emulating 2x as 4x, this is still 2x for simplicity of using this + // field to make guest-related decisions. Render pass objects are not very + // expensive, and their dependencies can't be shared between 2x-as-4x and + // true 4x MSAA passes (framebuffers because render target cache render + // targets are different for 2x and 4x guest MSAA, pipelines because the + // sample mask will have 2 samples excluded for 2x-as-4x). xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 2 // << 0 is depth, << 1...4 is color. uint32_t depth_and_color_used : 1 + xenos::kMaxColorRenderTargets; // 7 @@ -46,7 +57,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { xenos::ColorRenderTargetFormat color_2_view_format : xenos::kColorRenderTargetFormatBits; // 20 xenos::ColorRenderTargetFormat color_3_view_format - : xenos::kColorRenderTargetFormatBits; // 24 + : xenos::kColorRenderTargetFormatBits; // 24 + uint32_t color_rts_use_transfer_formats : 1; // 25 }; uint32_t key = 0; struct Hasher { @@ -60,6 +72,9 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool operator!=(const RenderPassKey& other_key) const { return !(*this == other_key); } + bool operator<(const RenderPassKey& other_key) const { + return key < other_key.key; + } }; static_assert_size(RenderPassKey, sizeof(uint32_t)); @@ -78,12 +93,14 @@ class VulkanRenderTargetCache final : public RenderTargetCache { void Shutdown(bool from_destructor = false); void ClearCache() override; - // TOOD(Triang3l): Fragment shader interlock. + void CompletedSubmissionUpdated(); + void EndSubmission(); + + // TODO(Triang3l): Fragment shader interlock. Path GetPath() const override { return Path::kHostRenderTargets; } - // TODO(Triang3l): Resolution scaling. 
- uint32_t GetResolutionScaleX() const override { return 1; } - uint32_t GetResolutionScaleY() const override { return 1; } + uint32_t GetResolutionScaleX() const override { return resolution_scale_x_; } + uint32_t GetResolutionScaleY() const override { return resolution_scale_y_; } bool Update(bool is_rasterization_done, uint32_t shader_writes_color_targets) override; @@ -98,6 +115,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache { return last_update_framebuffer_; } + bool msaa_2x_attachments_supported() const { + return msaa_2x_attachments_supported_; + } + bool msaa_2x_no_attachments_supported() const { + return msaa_2x_no_attachments_supported_; + } + bool IsMsaa2xSupported(bool subpass_has_attachments) const { + return subpass_has_attachments ? msaa_2x_attachments_supported_ + : msaa_2x_no_attachments_supported_; + } + // Returns the render pass object, or VK_NULL_HANDLE if failed to create. // A render pass managed by the render target cache may be ended and resumed // at any time (to allow for things like copying and texture loading). @@ -110,6 +138,99 @@ class VulkanRenderTargetCache final : public RenderTargetCache { bool* is_integer_out = nullptr) const; protected: + uint32_t GetMaxRenderTargetWidth() const override; + uint32_t GetMaxRenderTargetHeight() const override; + + RenderTarget* CreateRenderTarget(RenderTargetKey key) override; + + // TODO(Triang3l): Check actual unorm24 support. + bool IsHostDepthEncodingDifferent( + xenos::DepthRenderTargetFormat format) const override { + return true; + } + + private: + enum class EdramBufferUsage { + // There's no need for combined fragment and compute usages. + // With host render targets, the usual usage sequence is as follows: + // - Optionally compute writes - host depth copy storing for EDRAM range + // ownership transfers. + // - Optionally fragment reads - host depth copy storing for EDRAM range + // ownership transfers. + // - Compute writes - copying from host render targets during resolving. + // - Compute reads - writing to the shared memory during resolving. + // With the render backend implementation based on fragment shader + // interlocks, it's: + // - Fragment reads and writes - depth / stencil and color operations. + // - Compute reads - writing to the shared memory during resolving. + // So, fragment reads and compute reads normally don't follow each other, + // and there's no need to amortize the cost of a read > read barrier in an + // exceptional situation by using a wider barrier in the normal scenario. + + // Host depth copy storing. + kFragmentRead, + // Fragment shader interlock depth / stencil and color operations. + kFragmentReadWrite, + // Resolve - copying to the shared memory. + kComputeRead, + // Resolve - copying from host render targets. + kComputeWrite, + // Trace recording. + kTransferRead, + // Trace playback. + kTransferWrite, + }; + enum class EdramBufferModificationStatus { + // The values are ordered by how strong the barrier conditions are. + // No uncommitted shader writes. + kUnmodified, + // Need to commit before the next fragment shader interlock usage with + // overlap. + kViaFragmentShaderInterlock, + // Need to commit before any next fragment shader interlock usage. 
+ kViaUnordered, + }; + static void GetEdramBufferUsageMasks(EdramBufferUsage usage, + VkPipelineStageFlags& stage_mask_out, + VkAccessFlags& access_mask_out); + void UseEdramBuffer(EdramBufferUsage new_usage); + void MarkEdramBufferModified( + EdramBufferModificationStatus modification_status = + EdramBufferModificationStatus::kViaUnordered); + void CommitEdramBufferShaderWrites( + EdramBufferModificationStatus commit_status = + EdramBufferModificationStatus::kViaFragmentShaderInterlock); + + VulkanCommandProcessor& command_processor_; + + uint32_t resolution_scale_x_ = 1; + uint32_t resolution_scale_y_ = 1; + + // Accessible in fragment and compute shaders. + VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE; + VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE; + VkDescriptorSetLayout descriptor_set_layout_sampled_image_x2_ = + VK_NULL_HANDLE; + + std::unique_ptr + descriptor_set_pool_sampled_image_; + std::unique_ptr + descriptor_set_pool_sampled_image_x2_; + + VkDeviceMemory edram_buffer_memory_ = VK_NULL_HANDLE; + VkBuffer edram_buffer_ = VK_NULL_HANDLE; + EdramBufferUsage edram_buffer_usage_; + EdramBufferModificationStatus edram_buffer_modification_status_ = + EdramBufferModificationStatus::kUnmodified; + VkDescriptorPool edram_storage_buffer_descriptor_pool_ = VK_NULL_HANDLE; + VkDescriptorSet edram_storage_buffer_descriptor_set_; + + // RenderPassKey::key -> VkRenderPass. + // VK_NULL_HANDLE if failed to create. + std::unordered_map render_passes_; + + // For host render targets. + // Can only be destroyed when framebuffers referencing it are destroyed! class VulkanRenderTarget final : public RenderTarget { public: @@ -131,27 +252,45 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // Takes ownership of the Vulkan objects passed to the constructor. VulkanRenderTarget(RenderTargetKey key, - const ui::vulkan::VulkanProvider& provider, + VulkanRenderTargetCache& render_target_cache, VkImage image, VkDeviceMemory memory, VkImageView view_depth_color, VkImageView view_depth_stencil, VkImageView view_stencil, VkImageView view_srgb, - VkImageView view_color_transfer_separate) + VkImageView view_color_transfer_separate, + size_t descriptor_set_index_transfer_source) : RenderTarget(key), - provider_(provider), + render_target_cache_(render_target_cache), image_(image), memory_(memory), view_depth_color_(view_depth_color), view_depth_stencil_(view_depth_stencil), view_stencil_(view_stencil), view_srgb_(view_srgb), - view_color_transfer_separate_(view_color_transfer_separate) {} + view_color_transfer_separate_(view_color_transfer_separate), + descriptor_set_index_transfer_source_( + descriptor_set_index_transfer_source) {} ~VulkanRenderTarget(); VkImage image() const { return image_; } VkImageView view_depth_color() const { return view_depth_color_; } VkImageView view_depth_stencil() const { return view_depth_stencil_; } + VkImageView view_color_transfer_separate() const { + return view_color_transfer_separate_; + } + VkImageView view_color_transfer() const { + return view_color_transfer_separate_ != VK_NULL_HANDLE + ? view_color_transfer_separate_ + : view_depth_color_; + } + VkDescriptorSet GetDescriptorSetTransferSource() const { + ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool = + key().is_depth + ? 
*render_target_cache_.descriptor_set_pool_sampled_image_x2_ + : *render_target_cache_.descriptor_set_pool_sampled_image_; + return descriptor_set_pool.Get(descriptor_set_index_transfer_source_); + } static void GetDrawUsage(bool is_depth, VkPipelineStageFlags* stage_mask_out, @@ -185,8 +324,13 @@ class VulkanRenderTargetCache final : public RenderTargetCache { current_layout_ = layout; } + uint32_t temporary_sort_index() const { return temporary_sort_index_; } + void SetTemporarySortIndex(uint32_t index) { + temporary_sort_index_ = index; + } + private: - const ui::vulkan::VulkanProvider& provider_; + VulkanRenderTargetCache& render_target_cache_; VkImage image_; VkDeviceMemory memory_; @@ -200,30 +344,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache { VkImageView view_srgb_; VkImageView view_color_transfer_separate_; + // 2 sampled images for depth / stencil, 1 sampled image for color. + size_t descriptor_set_index_transfer_source_; + VkPipelineStageFlags current_stage_mask_ = 0; VkAccessFlags current_access_mask_ = 0; VkImageLayout current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; + + // Temporary storage for indices in operations like transfers and dumps. + uint32_t temporary_sort_index_ = 0; }; - uint32_t GetMaxRenderTargetWidth() const override; - uint32_t GetMaxRenderTargetHeight() const override; - - RenderTarget* CreateRenderTarget(RenderTargetKey key) override; - - // TODO(Triang3l): Check actual unorm24 support. - bool IsHostDepthEncodingDifferent( - xenos::DepthRenderTargetFormat format) const override { - return true; - } - - private: - VulkanCommandProcessor& command_processor_; - - // RenderPassKey::key -> VkRenderPass. - std::unordered_map render_passes_; - - // For host render targets. - struct FramebufferKey { RenderPassKey render_pass_key; @@ -254,13 +385,276 @@ class VulkanRenderTargetCache final : public RenderTargetCache { void Reset() { std::memset(this, 0, sizeof(*this)); } }; + enum TransferUsedDescriptorSet : uint32_t { + // Ordered from the least to the most frequently changed. + kTransferUsedDescriptorSetHostDepthBuffer, + kTransferUsedDescriptorSetHostDepthStencilTextures, + kTransferUsedDescriptorSetDepthStencilTextures, + // Mutually exclusive with kTransferUsedDescriptorSetDepthStencilTextures. + kTransferUsedDescriptorSetColorTexture, + + kTransferUsedDescriptorSetCount, + + kTransferUsedDescriptorSetHostDepthBufferBit = + uint32_t(1) << kTransferUsedDescriptorSetHostDepthBuffer, + kTransferUsedDescriptorSetHostDepthStencilTexturesBit = + uint32_t(1) << kTransferUsedDescriptorSetHostDepthStencilTextures, + kTransferUsedDescriptorSetDepthStencilTexturesBit = + uint32_t(1) << kTransferUsedDescriptorSetDepthStencilTextures, + kTransferUsedDescriptorSetColorTextureBit = + uint32_t(1) << kTransferUsedDescriptorSetColorTexture, + }; + + // 32-bit push constants (for simplicity of size calculation and to avoid + // std140 packing issues). + enum TransferUsedPushConstantDword : uint32_t { + kTransferUsedPushConstantDwordHostDepthAddress, + kTransferUsedPushConstantDwordAddress, + // Changed 8 times per transfer. 
+ kTransferUsedPushConstantDwordStencilMask, + + kTransferUsedPushConstantDwordCount, + + kTransferUsedPushConstantDwordHostDepthAddressBit = + uint32_t(1) << kTransferUsedPushConstantDwordHostDepthAddress, + kTransferUsedPushConstantDwordAddressBit = + uint32_t(1) << kTransferUsedPushConstantDwordAddress, + kTransferUsedPushConstantDwordStencilMaskBit = + uint32_t(1) << kTransferUsedPushConstantDwordStencilMask, + }; + + enum class TransferPipelineLayoutIndex { + kColor, + kDepth, + kColorToStencilBit, + kDepthToStencilBit, + kColorAndHostDepthTexture, + kColorAndHostDepthBuffer, + kDepthAndHostDepthTexture, + kDepthAndHostDepthBuffer, + + kCount, + }; + + struct TransferPipelineLayoutInfo { + uint32_t used_descriptor_sets; + uint32_t used_push_constant_dwords; + }; + + static const TransferPipelineLayoutInfo + kTransferPipelineLayoutInfos[size_t(TransferPipelineLayoutIndex::kCount)]; + + enum class TransferMode : uint32_t { + kColorToDepth, + kColorToColor, + + kDepthToDepth, + kDepthToColor, + + kColorToStencilBit, + kDepthToStencilBit, + + // Two-source modes, using the host depth if it, when converted to the guest + // format, matches what's in the owner source (not modified, keep host + // precision), or the guest data otherwise (significantly modified, possibly + // cleared). Stencil for FragStencilRef is always taken from the guest + // source. + + kColorAndHostDepthToDepth, + // When using different source and destination depth formats. + kDepthAndHostDepthToDepth, + + // If host depth is fetched, but it's the same image as the destination, + // it's copied to the EDRAM buffer (but since it's just a scratch buffer, + // with tiles laid out linearly with the same pitch as in the original + // render target; also no swapping of 40-sample columns as opposed to the + // host render target - this is done only for the color source) and fetched + // from there instead of the host depth texture. + kColorAndHostDepthCopyToDepth, + kDepthAndHostDepthCopyToDepth, + + kCount, + }; + + enum class TransferOutput { + kColor, + kDepth, + kStencilBit, + }; + + struct TransferModeInfo { + TransferOutput output; + TransferPipelineLayoutIndex pipeline_layout; + }; + + static const TransferModeInfo kTransferModes[size_t(TransferMode::kCount)]; + + union TransferShaderKey { + uint32_t key; + struct { + xenos::MsaaSamples dest_msaa_samples : xenos::kMsaaSamplesBits; + uint32_t dest_color_rt_index : xenos::kColorRenderTargetIndexBits; + uint32_t dest_resource_format : xenos::kRenderTargetFormatBits; + xenos::MsaaSamples source_msaa_samples : xenos::kMsaaSamplesBits; + // Always 1x when the host depth is a copy from a buffer rather than an + // image, not to create the same pipeline for different MSAA sample counts + // as it doesn't matter in this case. + xenos::MsaaSamples host_depth_source_msaa_samples + : xenos::kMsaaSamplesBits; + uint32_t source_resource_format : xenos::kRenderTargetFormatBits; + + // Last bits because this affects the pipeline layout - after sorting, + // only change it as fewer times as possible. Depth buffers have an + // additional stencil texture. 
+ static_assert(size_t(TransferMode::kCount) <= (size_t(1) << 4)); + TransferMode mode : 4; + }; + + TransferShaderKey() : key(0) { static_assert_size(*this, sizeof(key)); } + + struct Hasher { + size_t operator()(const TransferShaderKey& key) const { + return std::hash{}(key.key); + } + }; + bool operator==(const TransferShaderKey& other_key) const { + return key == other_key.key; + } + bool operator!=(const TransferShaderKey& other_key) const { + return !(*this == other_key); + } + bool operator<(const TransferShaderKey& other_key) const { + return key < other_key.key; + } + }; + + struct TransferPipelineKey { + RenderPassKey render_pass_key; + TransferShaderKey shader_key; + + TransferPipelineKey(RenderPassKey render_pass_key, + TransferShaderKey shader_key) + : render_pass_key(render_pass_key), shader_key(shader_key) {} + + struct Hasher { + size_t operator()(const TransferPipelineKey& key) const { + XXH3_state_t hash_state; + XXH3_64bits_reset(&hash_state); + XXH3_64bits_update(&hash_state, &key.render_pass_key, + sizeof(key.render_pass_key)); + XXH3_64bits_update(&hash_state, &key.shader_key, + sizeof(key.shader_key)); + return static_cast(XXH3_64bits_digest(&hash_state)); + } + }; + bool operator==(const TransferPipelineKey& other_key) const { + return render_pass_key == other_key.render_pass_key && + shader_key == other_key.shader_key; + } + bool operator!=(const TransferPipelineKey& other_key) const { + return !(*this == other_key); + } + bool operator<(const TransferPipelineKey& other_key) const { + if (render_pass_key != other_key.render_pass_key) { + return render_pass_key < other_key.render_pass_key; + } + return shader_key < other_key.shader_key; + } + }; + + union TransferAddressConstant { + uint32_t constant; + struct { + // All in tiles. + uint32_t dest_pitch : xenos::kEdramPitchTilesBits; + uint32_t source_pitch : xenos::kEdramPitchTilesBits; + // Safe to use 12 bits for signed difference - no ownership transfer can + // ever occur between render targets with EDRAM base >= 2048 as this would + // result in 0-length spans. 10 + 10 + 12 is exactly 32, any more bits, + // and more root 32-bit constants will be used. + // Destination base in tiles minus source base in tiles (not vice versa + // because this is a transform of the coordinate system, not addresses + // themselves). + // 0 for host_depth_source_is_copy (ignored in this case anyway as + // destination == source anyway). + int32_t source_to_dest : xenos::kEdramBaseTilesBits; + }; + TransferAddressConstant() : constant(0) { + static_assert_size(*this, sizeof(constant)); + } + bool operator==(const TransferAddressConstant& other_constant) const { + return constant == other_constant.constant; + } + bool operator!=(const TransferAddressConstant& other_constant) const { + return !(*this == other_constant); + } + }; + + struct TransferInvocation { + Transfer transfer; + TransferShaderKey shader_key; + TransferInvocation(const Transfer& transfer, + const TransferShaderKey& shader_key) + : transfer(transfer), shader_key(shader_key) {} + bool operator<(const TransferInvocation& other_invocation) { + // TODO(Triang3l): See if it may be better to sort by the source in the + // first place, especially when reading the same data multiple times (like + // to write the stencil bits after depth) for better read locality. + // Sort by the shader key primarily to reduce pipeline state (context) + // switches. 
+ if (shader_key != other_invocation.shader_key) { + return shader_key < other_invocation.shader_key; + } + // Host depth render targets are changed rarely if they exist, won't save + // many binding changes, ignore them for simplicity (their existence is + // caught by the shader key change). + assert_not_null(transfer.source); + assert_not_null(other_invocation.transfer.source); + uint32_t source_index = + static_cast(transfer.source) + ->temporary_sort_index(); + uint32_t other_source_index = static_cast( + other_invocation.transfer.source) + ->temporary_sort_index(); + if (source_index != other_source_index) { + return source_index < other_source_index; + } + return transfer.start_tiles < other_invocation.transfer.start_tiles; + } + bool CanBeMergedIntoOneDraw( + const TransferInvocation& other_invocation) const { + return shader_key == other_invocation.shader_key && + transfer.AreSourcesSame(other_invocation.transfer); + } + }; + // Returns the framebuffer object, or VK_NULL_HANDLE if failed to create. const Framebuffer* GetFramebuffer( RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp, const RenderTarget* const* depth_and_color_render_targets); + VkShaderModule GetTransferShader(TransferShaderKey key); + // With sample-rate shading, returns a pointer to one pipeline. Without + // sample-rate shading, returns a pointer to as many pipelines as there are + // samples. If there was a failure to create a pipeline, returns nullptr. + VkPipeline const* GetTransferPipelines(TransferPipelineKey key); + + // Do ownership transfers for render targets - each render target / vector may + // be null / empty in case there's nothing to do for them. + // resolve_clear_rectangle is expected to be provided by + // PrepareHostRenderTargetsResolveClear which should do all the needed size + // bound checks. + void PerformTransfersAndResolveClears( + uint32_t render_target_count, RenderTarget* const* render_targets, + const std::vector* render_target_transfers, + const uint64_t* render_target_resolve_clear_values = nullptr, + const Transfer::Rectangle* resolve_clear_rectangle = nullptr); + bool gamma_render_target_as_srgb_ = false; + bool msaa_2x_attachments_supported_ = false; + bool msaa_2x_no_attachments_supported_ = false; + std::unordered_map framebuffers_; @@ -271,6 +665,32 @@ class VulkanRenderTargetCache final : public RenderTargetCache { last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] = {}; const Framebuffer* last_update_framebuffer_ = VK_NULL_HANDLE; + + // Set 0 - EDRAM storage buffer, set 1 - source depth sampled image (and + // unused stencil from the transfer descriptor set), HostDepthStoreConstants + // passed via push constants. + VkPipelineLayout host_depth_store_pipeline_layout_ = VK_NULL_HANDLE; + VkPipeline host_depth_store_pipelines_[size_t(xenos::MsaaSamples::k4X) + 1] = + {}; + + std::unique_ptr + transfer_vertex_buffer_pool_; + VkShaderModule transfer_passthrough_vertex_shader_ = VK_NULL_HANDLE; + VkPipelineLayout transfer_pipeline_layouts_[size_t( + TransferPipelineLayoutIndex::kCount)] = {}; + // VK_NULL_HANDLE if failed to create. + std::unordered_map + transfer_shaders_; + // With sample-rate shading, one pipeline per entry. Without sample-rate + // shading, one pipeline per sample per entry. VK_NULL_HANDLE if failed to + // create. + std::unordered_map, + TransferPipelineKey::Hasher> + transfer_pipelines_; + + // Temporary storage for PerformTransfersAndResolveClears. 
+ std::vector current_transfer_invocations_; }; } // namespace vulkan diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc index 0d95189da..788b8166a 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc @@ -177,6 +177,10 @@ bool VulkanSharedMemory::Initialize() { } } + // The first usage will likely be uploading. + last_usage_ = Usage::kTransferDestination; + last_written_range_ = std::make_pair(0, 0); + upload_buffer_pool_ = std::make_unique( provider, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, xe::align(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize, @@ -190,9 +194,6 @@ void VulkanSharedMemory::Shutdown(bool from_destructor) { upload_buffer_pool_.reset(); - last_written_range_ = std::make_pair(0, 0); - last_usage_ = Usage::kTransferDestination; - const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); @@ -226,8 +227,8 @@ void VulkanSharedMemory::Use(Usage usage, if (last_usage_ != usage || last_written_range_.second) { VkPipelineStageFlags src_stage_mask, dst_stage_mask; VkAccessFlags src_access_mask, dst_access_mask; - GetBarrier(last_usage_, src_stage_mask, src_access_mask); - GetBarrier(usage, dst_stage_mask, dst_access_mask); + GetUsageMasks(last_usage_, src_stage_mask, src_access_mask); + GetUsageMasks(usage, dst_stage_mask, dst_access_mask); VkDeviceSize offset, size; if (last_usage_ == usage) { // Committing the previous write, while not changing the access mask @@ -447,9 +448,9 @@ bool VulkanSharedMemory::UploadRanges( return successful; } -void VulkanSharedMemory::GetBarrier(Usage usage, - VkPipelineStageFlags& stage_mask, - VkAccessFlags& access_mask) const { +void VulkanSharedMemory::GetUsageMasks(Usage usage, + VkPipelineStageFlags& stage_mask, + VkAccessFlags& access_mask) const { switch (usage) { case Usage::kComputeWrite: stage_mask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.h b/src/xenia/gpu/vulkan/vulkan_shared_memory.h index 0d8e90813..b37949ec8 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.h +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.h @@ -47,8 +47,8 @@ class VulkanSharedMemory : public SharedMemory { kComputeWrite, kTransferDestination, }; - // Places pipeline barrier for the target usage, also ensuring writes of - // adjacent are ordered with writes of each other and reads. + // Inserts a pipeline barrier for the target usage, also ensuring consecutive + // read-write accesses are ordered with each other. void Use(Usage usage, std::pair written_range = {}); VkBuffer buffer() const { return buffer_; } @@ -65,8 +65,8 @@ class VulkanSharedMemory : public SharedMemory { upload_page_ranges) override; private: - void GetBarrier(Usage usage, VkPipelineStageFlags& stage_mask, - VkAccessFlags& access_mask) const; + void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask, + VkAccessFlags& access_mask) const; VulkanCommandProcessor& command_processor_; TraceWriter& trace_writer_; @@ -76,9 +76,8 @@ class VulkanSharedMemory : public SharedMemory { // Single for non-sparse, every allocation so far for sparse. std::vector buffer_memory_; - // First usage will likely be uploading. 
- Usage last_usage_ = Usage::kTransferDestination; - std::pair last_written_range_ = {}; + Usage last_usage_; + std::pair last_written_range_; std::unique_ptr upload_buffer_pool_; std::vector upload_regions_; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index d2279a7b8..2f88bc74c 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -248,6 +248,7 @@ enum class MsaaSamples : uint32_t { constexpr uint32_t kMsaaSamplesBits = 2; +constexpr uint32_t kColorRenderTargetIndexBits = 2; constexpr uint32_t kMaxColorRenderTargets = 4; enum class ColorRenderTargetFormat : uint32_t { diff --git a/src/xenia/ui/vulkan/functions/device_1_0.inc b/src/xenia/ui/vulkan/functions/device_1_0.inc index 2a979f55f..148d6dd52 100644 --- a/src/xenia/ui/vulkan/functions/device_1_0.inc +++ b/src/xenia/ui/vulkan/functions/device_1_0.inc @@ -15,6 +15,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdClearColorImage) XE_UI_VULKAN_FUNCTION(vkCmdCopyBuffer) XE_UI_VULKAN_FUNCTION(vkCmdCopyBufferToImage) XE_UI_VULKAN_FUNCTION(vkCmdCopyImageToBuffer) +XE_UI_VULKAN_FUNCTION(vkCmdDispatch) XE_UI_VULKAN_FUNCTION(vkCmdDraw) XE_UI_VULKAN_FUNCTION(vkCmdDrawIndexed) XE_UI_VULKAN_FUNCTION(vkCmdEndRenderPass) @@ -29,6 +30,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdSetStencilWriteMask) XE_UI_VULKAN_FUNCTION(vkCmdSetViewport) XE_UI_VULKAN_FUNCTION(vkCreateBuffer) XE_UI_VULKAN_FUNCTION(vkCreateCommandPool) +XE_UI_VULKAN_FUNCTION(vkCreateComputePipelines) XE_UI_VULKAN_FUNCTION(vkCreateDescriptorPool) XE_UI_VULKAN_FUNCTION(vkCreateDescriptorSetLayout) XE_UI_VULKAN_FUNCTION(vkCreateFence) diff --git a/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc new file mode 100644 index 000000000..8dfff2a3f --- /dev/null +++ b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.cc @@ -0,0 +1,120 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h" + +#include "xenia/base/assert.h" +#include "xenia/base/logging.h" + +namespace xe { +namespace ui { +namespace vulkan { + +SingleLayoutDescriptorSetPool::SingleLayoutDescriptorSetPool( + const VulkanProvider& provider, uint32_t pool_set_count, + uint32_t set_layout_descriptor_counts_count, + const VkDescriptorPoolSize* set_layout_descriptor_counts, + VkDescriptorSetLayout set_layout) + : provider_(provider), + pool_set_count_(pool_set_count), + set_layout_(set_layout) { + assert_not_zero(pool_set_count); + pool_descriptor_counts_.resize(set_layout_descriptor_counts_count); + for (uint32_t i = 0; i < set_layout_descriptor_counts_count; ++i) { + VkDescriptorPoolSize& pool_descriptor_type_count = + pool_descriptor_counts_[i]; + const VkDescriptorPoolSize& set_layout_descriptor_type_count = + set_layout_descriptor_counts[i]; + pool_descriptor_type_count.type = set_layout_descriptor_type_count.type; + pool_descriptor_type_count.descriptorCount = + set_layout_descriptor_type_count.descriptorCount * pool_set_count; + } +} + +SingleLayoutDescriptorSetPool::~SingleLayoutDescriptorSetPool() { + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); + VkDevice device = provider_.device(); + if (current_pool_ != VK_NULL_HANDLE) { + dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr); + } + for (VkDescriptorPool pool : full_pools_) { + dfn.vkDestroyDescriptorPool(device, pool, nullptr); + } +} + +size_t SingleLayoutDescriptorSetPool::Allocate() { + if (!descriptor_sets_free_.empty()) { + size_t free_index = descriptor_sets_free_.back(); + descriptor_sets_free_.pop_back(); + return free_index; + } + + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn(); + VkDevice device = provider_.device(); + + // Two iterations so if vkAllocateDescriptorSets fails even with a non-zero + // current_pool_sets_remaining_, another attempt will be made in a new pool. 
+  for (uint32_t i = 0; i < 2; ++i) {
+    if (current_pool_ != VK_NULL_HANDLE && !current_pool_sets_remaining_) {
+      full_pools_.push_back(current_pool_);
+      current_pool_ = VK_NULL_HANDLE;
+    }
+    if (current_pool_ == VK_NULL_HANDLE) {
+      VkDescriptorPoolCreateInfo pool_create_info;
+      pool_create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+      pool_create_info.pNext = nullptr;
+      pool_create_info.flags = 0;
+      pool_create_info.maxSets = pool_set_count_;
+      pool_create_info.poolSizeCount = uint32_t(pool_descriptor_counts_.size());
+      pool_create_info.pPoolSizes = pool_descriptor_counts_.data();
+      if (dfn.vkCreateDescriptorPool(device, &pool_create_info, nullptr,
+                                     &current_pool_) != VK_SUCCESS) {
+        XELOGE(
+            "SingleLayoutDescriptorSetPool: Failed to create a descriptor "
+            "pool");
+        return SIZE_MAX;
+      }
+      current_pool_sets_remaining_ = pool_set_count_;
+    }
+
+    VkDescriptorSetAllocateInfo descriptor_set_allocate_info;
+    descriptor_set_allocate_info.sType =
+        VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+    descriptor_set_allocate_info.pNext = nullptr;
+    descriptor_set_allocate_info.descriptorPool = current_pool_;
+    descriptor_set_allocate_info.descriptorSetCount = 1;
+    descriptor_set_allocate_info.pSetLayouts = &set_layout_;
+    VkDescriptorSet descriptor_set;
+    if (dfn.vkAllocateDescriptorSets(device, &descriptor_set_allocate_info,
+                                     &descriptor_set) != VK_SUCCESS) {
+      XELOGE(
+          "SingleLayoutDescriptorSetPool: Failed to allocate a descriptor "
+          "set");
+      if (current_pool_sets_remaining_ >= pool_set_count_) {
+        // Failed to allocate in a new pool - something completely wrong, don't
+        // store empty pools as full.
+        dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr);
+        current_pool_ = VK_NULL_HANDLE;
+        return SIZE_MAX;
+      }
+      full_pools_.push_back(current_pool_);
+      current_pool_ = VK_NULL_HANDLE;
+      // Retry in a newly created pool on the second iteration.
+      continue;
+    }
+    --current_pool_sets_remaining_;
+    descriptor_sets_.push_back(descriptor_set);
+    return descriptor_sets_.size() - 1;
+  }
+
+  // Both attempts have failed.
+  return SIZE_MAX;
+}
+
+}  // namespace vulkan
+}  // namespace ui
+}  // namespace xe
diff --git a/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h
new file mode 100644
index 000000000..c3f3eb080
--- /dev/null
+++ b/src/xenia/ui/vulkan/single_layout_descriptor_set_pool.h
@@ -0,0 +1,63 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_
+#define XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_
+
+#include
+#include
+#include
+
+#include "xenia/base/assert.h"
+#include "xenia/ui/vulkan/vulkan_provider.h"
+
+namespace xe {
+namespace ui {
+namespace vulkan {
+
+class SingleLayoutDescriptorSetPool {
+ public:
+  // set_layout_descriptor_counts must contain the numbers of descriptors of
+  // each type in a single set with the layout (the multiplication by the pool
+  // set count will be done internally). The descriptor set layout must not be
+  // destroyed until this object is also destroyed.
+ SingleLayoutDescriptorSetPool( + const VulkanProvider& provider, uint32_t pool_set_count, + uint32_t set_layout_descriptor_counts_count, + const VkDescriptorPoolSize* set_layout_descriptor_counts, + VkDescriptorSetLayout set_layout); + ~SingleLayoutDescriptorSetPool(); + + // Returns SIZE_MAX in case of a failure. + size_t Allocate(); + void Free(size_t index) { + assert_true(index < descriptor_sets_.size()); + descriptor_sets_free_.push_back(index); + } + VkDescriptorSet Get(size_t index) const { return descriptor_sets_[index]; } + + private: + const VulkanProvider& provider_; + uint32_t pool_set_count_; + std::vector pool_descriptor_counts_; + VkDescriptorSetLayout set_layout_; + + std::vector full_pools_; + VkDescriptorPool current_pool_ = VK_NULL_HANDLE; + uint32_t current_pool_sets_remaining_ = 0; + + std::vector descriptor_sets_; + std::vector descriptor_sets_free_; +}; + +} // namespace vulkan +} // namespace ui +} // namespace xe + +#endif // XENIA_UI_VULKAN_SINGLE_DESCRIPTOR_SET_POOL_H_ diff --git a/src/xenia/ui/vulkan/vulkan_provider.cc b/src/xenia/ui/vulkan/vulkan_provider.cc index 2d93485ff..eb48cfa23 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.cc +++ b/src/xenia/ui/vulkan/vulkan_provider.cc @@ -715,6 +715,8 @@ bool VulkanProvider::Initialize() { static const std::pair kUsedDeviceExtensions[] = { {"VK_EXT_fragment_shader_interlock", offsetof(DeviceExtensions, ext_fragment_shader_interlock)}, + {"VK_EXT_shader_stencil_export", + offsetof(DeviceExtensions, ext_shader_stencil_export)}, {"VK_KHR_dedicated_allocation", offsetof(DeviceExtensions, khr_dedicated_allocation)}, {"VK_KHR_image_format_list", @@ -946,6 +948,8 @@ bool VulkanProvider::Initialize() { XELOGVK("Vulkan device extensions:"); XELOGVK("* VK_EXT_fragment_shader_interlock: {}", device_extensions_.ext_fragment_shader_interlock ? "yes" : "no"); + XELOGVK("* VK_EXT_shader_stencil_export: {}", + device_extensions_.ext_shader_stencil_export ? "yes" : "no"); XELOGVK("* VK_KHR_dedicated_allocation: {}", device_extensions_.khr_dedicated_allocation ? "yes" : "no"); XELOGVK("* VK_KHR_image_format_list: {}", diff --git a/src/xenia/ui/vulkan/vulkan_provider.h b/src/xenia/ui/vulkan/vulkan_provider.h index 0887b88ac..83f4d587f 100644 --- a/src/xenia/ui/vulkan/vulkan_provider.h +++ b/src/xenia/ui/vulkan/vulkan_provider.h @@ -132,6 +132,7 @@ class VulkanProvider : public GraphicsProvider { } struct DeviceExtensions { bool ext_fragment_shader_interlock; + bool ext_shader_stencil_export; // Core since 1.1.0. bool khr_dedicated_allocation; // Core since 1.2.0. 
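To make the intended lifetime and call pattern of the new SingleLayoutDescriptorSetPool concrete, a minimal usage sketch follows. The provider, set layout, descriptor count per set and the pool capacity passed in are hypothetical values chosen for illustration; only the constructor, Allocate, Get and Free calls come from the class declared in this change.

#include <cstddef>
#include <memory>

#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h"

// Hypothetical caller: `provider` and `set_layout` are assumed to be created
// elsewhere; 2 sampled images per set and a 256-set pool capacity are
// illustrative, mirroring how descriptor_set_pool_sampled_image_x2_ is meant
// to serve depth + stencil transfer sources.
size_t AllocateTransferSourceDescriptorSet(
    const xe::ui::vulkan::VulkanProvider& provider,
    VkDescriptorSetLayout set_layout,
    std::unique_ptr<xe::ui::vulkan::SingleLayoutDescriptorSetPool>& pool) {
  if (!pool) {
    VkDescriptorPoolSize set_descriptor_count;
    set_descriptor_count.type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
    set_descriptor_count.descriptorCount = 2;
    pool = std::make_unique<xe::ui::vulkan::SingleLayoutDescriptorSetPool>(
        provider, 256, 1, &set_descriptor_count, set_layout);
  }
  // Allocate() hands out a stable index (SIZE_MAX on failure) rather than the
  // VkDescriptorSet itself, so owners such as VulkanRenderTarget can store a
  // small integer and resolve it with Get() only when recording.
  size_t index = pool->Allocate();
  if (index == SIZE_MAX) {
    return SIZE_MAX;
  }
  VkDescriptorSet set = pool->Get(index);
  (void)set;  // vkUpdateDescriptorSets with the render target's views here.
  // When the owning render target is destroyed, the slot is recycled with
  // pool->Free(index).
  return index;
}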
diff --git a/src/xenia/ui/vulkan/vulkan_util.cc b/src/xenia/ui/vulkan/vulkan_util.cc index f8dd5846e..b4eb02c3f 100644 --- a/src/xenia/ui/vulkan/vulkan_util.cc +++ b/src/xenia/ui/vulkan/vulkan_util.cc @@ -189,6 +189,53 @@ bool CreateDedicatedAllocationImage(const VulkanProvider& provider, return true; } +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + VkShaderModule shader, const VkSpecializationInfo* specialization_info, + const char* entry_point) { + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + VkComputePipelineCreateInfo pipeline_create_info; + pipeline_create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + pipeline_create_info.pNext = nullptr; + pipeline_create_info.flags = 0; + pipeline_create_info.stage.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + pipeline_create_info.stage.pNext = nullptr; + pipeline_create_info.stage.flags = 0; + pipeline_create_info.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + pipeline_create_info.stage.module = shader; + pipeline_create_info.stage.pName = entry_point; + pipeline_create_info.stage.pSpecializationInfo = specialization_info; + pipeline_create_info.layout = layout; + pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE; + pipeline_create_info.basePipelineIndex = -1; + VkPipeline pipeline; + if (dfn.vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, + &pipeline_create_info, nullptr, + &pipeline) != VK_SUCCESS) { + return VK_NULL_HANDLE; + } + return pipeline; +} + +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + const uint32_t* shader_code, size_t shader_code_size_bytes, + const VkSpecializationInfo* specialization_info, const char* entry_point) { + VkShaderModule shader = + CreateShaderModule(provider, shader_code, shader_code_size_bytes); + if (shader == VK_NULL_HANDLE) { + return VK_NULL_HANDLE; + } + const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); + VkDevice device = provider.device(); + VkPipeline pipeline = CreateComputePipeline(provider, layout, shader, + specialization_info, entry_point); + dfn.vkDestroyShaderModule(device, shader, nullptr); + return pipeline; +} + } // namespace util } // namespace vulkan } // namespace ui diff --git a/src/xenia/ui/vulkan/vulkan_util.h b/src/xenia/ui/vulkan/vulkan_util.h index fda575305..7af10f65f 100644 --- a/src/xenia/ui/vulkan/vulkan_util.h +++ b/src/xenia/ui/vulkan/vulkan_util.h @@ -164,6 +164,17 @@ inline VkShaderModule CreateShaderModule(const VulkanProvider& provider, : VK_NULL_HANDLE; } +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + VkShaderModule shader, + const VkSpecializationInfo* specialization_info = nullptr, + const char* entry_point = "main"); +VkPipeline CreateComputePipeline( + const VulkanProvider& provider, VkPipelineLayout layout, + const uint32_t* shader_code, size_t shader_code_size_bytes, + const VkSpecializationInfo* specialization_info = nullptr, + const char* entry_point = "main"); + } // namespace util } // namespace vulkan } // namespace ui
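The new vkCreateComputePipelines / vkCmdDispatch imports and the util::CreateComputePipeline helpers are what the EDRAM host depth store and resolve dispatches build on. Below is a sketch of the expected call sequence; the command buffer, pipeline layout, descriptor set, SPIR-V blob and workgroup counts are assumed caller inputs, and the bind calls assume those entry points are already in the device function table as they are for the graphics path. A real caller would also cache the pipeline rather than create it per dispatch.

#include <cstddef>
#include <cstdint>

#include "xenia/ui/vulkan/vulkan_provider.h"
#include "xenia/ui/vulkan/vulkan_util.h"

// Hypothetical caller - every handle and count passed in is assumed; only the
// helper and the newly imported dispatch call reflect this change.
bool RecordComputeDispatch(const xe::ui::vulkan::VulkanProvider& provider,
                           VkCommandBuffer command_buffer,
                           VkPipelineLayout layout,
                           VkDescriptorSet storage_buffer_set,
                           const uint32_t* shader_code,
                           size_t shader_code_size_bytes,
                           uint32_t group_count_x, uint32_t group_count_y) {
  // The overload taking SPIR-V words creates the shader module, builds the
  // pipeline and destroys the module again internally.
  VkPipeline pipeline = xe::ui::vulkan::util::CreateComputePipeline(
      provider, layout, shader_code, shader_code_size_bytes);
  if (pipeline == VK_NULL_HANDLE) {
    return false;
  }
  const xe::ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
  dfn.vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                        pipeline);
  dfn.vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                              layout, 0, 1, &storage_buffer_set, 0, nullptr);
  dfn.vkCmdDispatch(command_buffer, group_count_x, group_count_y, 1);
  // A persistent caller would keep the pipeline (as host_depth_store_pipelines_
  // does) and destroy it at shutdown instead of letting it leak here.
  return true;
}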
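Finally, the EdramBufferUsage / GetEdramBufferUsageMasks pair declared in vulkan_render_target_cache.h earlier in this diff follows the same pattern as VulkanSharedMemory::GetUsageMasks: each usage resolves to one pipeline stage mask and one access mask for a global memory barrier. The mapping below is a plausible sketch of that translation based on the enum comments, not a copy of the implementation in this change.

#include "xenia/ui/vulkan/vulkan_provider.h"

// Illustrative only - the real GetEdramBufferUsageMasks lives in
// vulkan_render_target_cache.cc and may differ in details.
enum class EdramBufferUsageSketch {
  kFragmentRead,       // Host depth copy fetching in fragment shaders.
  kFragmentReadWrite,  // Fragment shader interlock depth/stencil and color.
  kComputeRead,        // Resolve - copying to the shared memory.
  kComputeWrite,       // Resolve - copying from host render targets.
  kTransferRead,       // Trace recording.
  kTransferWrite,      // Trace playback.
};

void GetEdramBufferUsageMasksSketch(EdramBufferUsageSketch usage,
                                    VkPipelineStageFlags& stage_mask_out,
                                    VkAccessFlags& access_mask_out) {
  switch (usage) {
    case EdramBufferUsageSketch::kFragmentRead:
      stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT;
      break;
    case EdramBufferUsageSketch::kFragmentReadWrite:
      stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
      break;
    case EdramBufferUsageSketch::kComputeRead:
      stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT;
      break;
    case EdramBufferUsageSketch::kComputeWrite:
      stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_WRITE_BIT;
      break;
    case EdramBufferUsageSketch::kTransferRead:
      stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT;
      access_mask_out = VK_ACCESS_TRANSFER_READ_BIT;
      break;
    case EdramBufferUsageSketch::kTransferWrite:
      stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT;
      access_mask_out = VK_ACCESS_TRANSFER_WRITE_BIT;
      break;
  }
}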