[Vulkan] EDRAM range ownership transfers, resolve clears, 2x-as-4x MSAA

Transfers are functional at a D3D12-like level, but still need additional work: using fallbacks when multisampled integer sampled images are not supported, and eliminating transfers between render targets within the same Vulkan format compatibility class by using different views directly.
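The fallback decision mentioned above presumably hinges on whether the device reports multisampled sample counts for integer sampled images. A minimal sketch of such a capability check using the standard VkPhysicalDeviceLimits::sampledImageIntegerSampleCounts limit (the function name and its use here are illustrative, not the commit's actual code):

bool CanSampleMultisampledIntegerImages(const VkPhysicalDeviceLimits& limits,
                                        VkSampleCountFlagBits samples) {
  // True only if every requested sample count bit is reported as supported
  // for sampled images with integer formats.
  return (limits.sampledImageIntegerSampleCounts & samples) == samples;
}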
Triang3l 2022-04-03 16:40:29 +03:00
parent 85fc7036b8
commit 0acb97d383
22 changed files with 5668 additions and 144 deletions

View File

@ -302,6 +302,10 @@ class RenderTargetCache {
}
return xenos::IsColorRenderTargetFormat64bpp(GetColorFormat());
}
const char* GetFormatName() const {
return is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat())
: xenos::GetColorRenderTargetFormatName(GetColorFormat());
}
uint32_t GetPitchTiles() const {
return pitch_tiles_at_32bpp << uint32_t(Is64bpp());
@ -317,11 +321,9 @@ class RenderTargetCache {
}
std::string GetDebugName() const {
return fmt::format(
"RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles, GetPitchTiles(),
uint32_t(1) << uint32_t(msaa_samples),
is_depth ? xenos::GetDepthRenderTargetFormatName(GetDepthFormat())
: xenos::GetColorRenderTargetFormatName(GetColorFormat()));
return fmt::format("RT @ {}t, <{}t>, {}xMSAA, {}", base_tiles,
GetPitchTiles(), uint32_t(1) << uint32_t(msaa_samples),
GetFormatName());
}
};

View File

@ -113,11 +113,9 @@ uint32_t SpirvShaderTranslator::GetModificationRegisterCount() const {
}
void SpirvShaderTranslator::StartTranslation() {
// Tool ID 26 "Xenia Emulator Microcode Translator".
// https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79
// TODO(Triang3l): Logger.
builder_ = std::make_unique<spv::Builder>(features_.spirv_version,
(26 << 16) | 1, nullptr);
builder_ = std::make_unique<spv::Builder>(
features_.spirv_version, (kSpirvMagicToolId << 16) | 1, nullptr);
builder_->addCapability(IsSpirvTessEvalShader() ? spv::CapabilityTessellation
: spv::CapabilityShader);
@ -1535,20 +1533,20 @@ spv::Id SpirvShaderTranslator::GetUnmodifiedOperandComponents(
static_cast<unsigned int>(original_operand.GetComponent(scalar_index)) -
static_cast<unsigned int>(SwizzleSource::kX));
}
id_vector_temp_util_.clear();
id_vector_temp_util_.reserve(component_count);
uint_vector_temp_util_.clear();
uint_vector_temp_util_.reserve(component_count);
uint32_t components_remaining = components;
uint32_t component_index;
while (xe::bit_scan_forward(components_remaining, &component_index)) {
components_remaining &= ~(uint32_t(1) << component_index);
id_vector_temp_util_.push_back(
uint_vector_temp_util_.push_back(
static_cast<unsigned int>(
original_operand.GetComponent(component_index)) -
static_cast<unsigned int>(SwizzleSource::kX));
}
return builder_->createRvalueSwizzle(spv::NoPrecision,
type_float_vectors_[component_count - 1],
operand_storage, id_vector_temp_util_);
operand_storage, uint_vector_temp_util_);
}
void SpirvShaderTranslator::GetOperandScalarXY(

View File

@ -138,6 +138,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
kDescriptorSetCount,
};
// "Xenia Emulator Microcode Translator".
// https://github.com/KhronosGroup/SPIRV-Headers/blob/c43a43c7cc3af55910b9bec2a71e3e8a622443cf/include/spirv/spir-v.xml#L79
static constexpr uint32_t kSpirvMagicToolId = 26;
struct Features {
explicit Features(const ui::vulkan::VulkanProvider& provider);
explicit Features(bool all = false);
@ -172,6 +176,38 @@ class SpirvShaderTranslator : public ShaderTranslator {
features_.max_storage_buffer_range);
}
// Common functions useful not only for the translator, but also for EDRAM
// emulation via conventional render targets.
// Converts the color value externally clamped to [0, 31.875] to 7e3 floating
// point, with zeros in bits 10:31, rounding to the nearest even.
static spv::Id PreClampedFloat32To7e3(spv::Builder& builder,
spv::Id f32_scalar,
spv::Id ext_inst_glsl_std_450);
// Same as PreClampedFloat32To7e3, but clamps the input to [0, 31.875].
static spv::Id UnclampedFloat32To7e3(spv::Builder& builder,
spv::Id f32_scalar,
spv::Id ext_inst_glsl_std_450);
// Converts the 7e3 number in bits [f10_shift, f10_shift + 10) to a 32-bit
// float.
static spv::Id Float7e3To32(spv::Builder& builder, spv::Id f10_uint_scalar,
uint32_t f10_shift, bool result_as_uint,
spv::Id ext_inst_glsl_std_450);
// Converts the depth value externally clamped to the representable [0, 2)
// range to 20e4 floating point, with zeros in bits 24:31, rounding to the
// nearest even. If remap_from_0_to_0_5 is true, it's assumed that 0...1 is
// pre-remapped to 0...0.5 in the input.
static spv::Id PreClampedDepthTo20e4(spv::Builder& builder,
spv::Id f32_scalar,
bool remap_from_0_to_0_5,
spv::Id ext_inst_glsl_std_450);
// Converts the 20e4 number in bits [f24_shift, f24_shift + 24) to a 32-bit
// float.
static spv::Id Depth20e4To32(spv::Builder& builder, spv::Id f24_uint_scalar,
uint32_t f24_shift, bool remap_to_0_to_0_5,
bool result_as_uint,
spv::Id ext_inst_glsl_std_450);
protected:
void Reset() override;

View File

@ -0,0 +1,425 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/spirv_shader_translator.h"
#include <cstdint>
#include <memory>
#include "third_party/glslang/SPIRV/GLSL.std.450.h"
#include "xenia/base/assert.h"
namespace xe {
namespace gpu {
spv::Id SpirvShaderTranslator::PreClampedFloat32To7e3(
spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) {
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// Assuming the value is already clamped to [0, 31.875].
spv::Id type_uint = builder.makeUintType(32);
// Need the source as uint for bit operations.
{
spv::Id source_type = builder.getTypeId(f32_scalar);
assert_true(builder.isScalarType(source_type));
if (!builder.isUintType(source_type)) {
f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar);
}
}
// The denormal 7e3 case.
// denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000
spv::Id denormal_biased_f32;
{
spv::Instruction* denormal_insert_instruction = new spv::Instruction(
builder.getUniqueId(), type_uint, spv::OpBitFieldInsert);
denormal_insert_instruction->addIdOperand(f32_scalar);
denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1));
denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23));
denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9));
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(denormal_insert_instruction));
denormal_biased_f32 = denormal_insert_instruction->getResultId();
}
// denormal_biased_f32_shift_amount = min(125 - (f32 >> 23), 24)
// Not allowing the shift to overflow as that's undefined in SPIR-V.
spv::Id denormal_biased_f32_shift_amount;
{
spv::Instruction* denormal_shift_amount_instruction =
new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst);
denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450);
denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin);
denormal_shift_amount_instruction->addIdOperand(builder.createBinOp(
spv::OpISub, type_uint, builder.makeUintConstant(125),
builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar,
builder.makeUintConstant(23))));
denormal_shift_amount_instruction->addIdOperand(
builder.makeUintConstant(24));
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(denormal_shift_amount_instruction));
denormal_biased_f32_shift_amount =
denormal_shift_amount_instruction->getResultId();
}
// denormal_biased_f32 =
// ((f32 & 0x7FFFFF) | 0x800000) >> min(125 - (f32 >> 23), 24)
denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint,
denormal_biased_f32,
denormal_biased_f32_shift_amount);
// The normal 7e3 case.
// Bias the exponent.
// normal_biased_f32 = f32 - (124 << 23)
spv::Id normal_biased_f32 =
builder.createBinOp(spv::OpISub, type_uint, f32_scalar,
builder.makeUintConstant(UINT32_C(124) << 23));
// Select the needed conversion depending on whether the number is too small
// to be represented as normalized 7e3.
spv::Id biased_f32 = builder.createTriOp(
spv::OpSelect, type_uint,
builder.createBinOp(spv::OpULessThan, builder.makeBoolType(), f32_scalar,
builder.makeUintConstant(0x3E800000)),
denormal_biased_f32, normal_biased_f32);
// Build the 7e3 number rounding to the nearest even.
// ((biased_f32 + 0x7FFF + ((biased_f32 >> 16) & 1)) >> 16) & 0x3FF
return builder.createTriOp(
spv::OpBitFieldUExtract, type_uint,
builder.createBinOp(
spv::OpIAdd, type_uint,
builder.createBinOp(spv::OpIAdd, type_uint, biased_f32,
builder.makeUintConstant(0x7FFF)),
builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32,
builder.makeUintConstant(16),
builder.makeUintConstant(1))),
builder.makeUintConstant(16), builder.makeUintConstant(10));
}
spv::Id SpirvShaderTranslator::UnclampedFloat32To7e3(
spv::Builder& builder, spv::Id f32_scalar, spv::Id ext_inst_glsl_std_450) {
spv::Id type_float = builder.makeFloatType(32);
// Need the source as float for clamping.
{
spv::Id source_type = builder.getTypeId(f32_scalar);
assert_true(builder.isScalarType(source_type));
if (!builder.isFloatType(source_type)) {
f32_scalar =
builder.createUnaryOp(spv::OpBitcast, type_float, f32_scalar);
}
}
{
spv::Instruction* clamp_instruction =
new spv::Instruction(builder.getUniqueId(), type_float, spv::OpExtInst);
clamp_instruction->addIdOperand(ext_inst_glsl_std_450);
clamp_instruction->addImmediateOperand(GLSLstd450NClamp);
clamp_instruction->addIdOperand(f32_scalar);
clamp_instruction->addIdOperand(builder.makeFloatConstant(0.0f));
clamp_instruction->addIdOperand(builder.makeFloatConstant(31.875f));
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(clamp_instruction));
f32_scalar = clamp_instruction->getResultId();
}
return PreClampedFloat32To7e3(builder, f32_scalar, ext_inst_glsl_std_450);
}
spv::Id SpirvShaderTranslator::Float7e3To32(spv::Builder& builder,
spv::Id f10_uint_scalar,
uint32_t f10_shift,
bool result_as_uint,
spv::Id ext_inst_glsl_std_450) {
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
assert_true(builder.isUintType(builder.getTypeId(f10_uint_scalar)));
assert_true(f10_shift <= (32 - 10));
spv::Id type_bool = builder.makeBoolType();
spv::Id type_int = builder.makeIntType(32);
spv::Id type_uint = builder.makeUintType(32);
spv::Id f10_unbiased_exponent = builder.createTriOp(
spv::OpBitFieldUExtract, type_uint, f10_uint_scalar,
builder.makeUintConstant(f10_shift + 7), builder.makeUintConstant(3));
spv::Id f10_mantissa = builder.createTriOp(
spv::OpBitFieldUExtract, type_uint, f10_uint_scalar,
builder.makeUintConstant(f10_shift), builder.makeUintConstant(7));
// The denormal nonzero 7e3 case.
// denormal_mantissa_msb = findMSB(f10_mantissa)
spv::Id denormal_mantissa_msb;
{
spv::Instruction* denormal_mantissa_msb_instruction =
new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst);
denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450);
denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb);
denormal_mantissa_msb_instruction->addIdOperand(f10_mantissa);
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(denormal_mantissa_msb_instruction));
denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId();
}
denormal_mantissa_msb =
builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb);
// denormal_f32_unbiased_exponent = 1 - (7 - findMSB(f10_mantissa))
// Or:
// denormal_f32_unbiased_exponent = findMSB(f10_mantissa) - 6
spv::Id denormal_f32_unbiased_exponent =
builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb,
builder.makeUintConstant(6));
// Normalize the mantissa.
// denormal_f32_mantissa = f10_mantissa << (7 - findMSB(f10_mantissa))
spv::Id denormal_f32_mantissa = builder.createBinOp(
spv::OpShiftLeftLogical, type_uint, f10_mantissa,
builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(7),
denormal_mantissa_msb));
// If the 7e3 number is zero, make sure the float32 number is zero too.
spv::Id f10_mantissa_is_nonzero = builder.createBinOp(
spv::OpINotEqual, type_bool, f10_mantissa, builder.makeUintConstant(0));
// Set the unbiased exponent to -124 for zero - 124 will be added later,
// resulting in zero float32.
denormal_f32_unbiased_exponent = builder.createTriOp(
spv::OpSelect, type_uint, f10_mantissa_is_nonzero,
denormal_f32_unbiased_exponent, builder.makeUintConstant(uint32_t(-124)));
denormal_f32_mantissa =
builder.createTriOp(spv::OpSelect, type_uint, f10_mantissa_is_nonzero,
denormal_f32_mantissa, builder.makeUintConstant(0));
// Select the needed conversion depending on whether the number is normal.
spv::Id f10_is_normal =
builder.createBinOp(spv::OpINotEqual, type_bool, f10_unbiased_exponent,
builder.makeUintConstant(0));
spv::Id f32_unbiased_exponent = builder.createTriOp(
spv::OpSelect, type_uint, f10_is_normal, f10_unbiased_exponent,
denormal_f32_unbiased_exponent);
spv::Id f32_mantissa =
builder.createTriOp(spv::OpSelect, type_uint, f10_is_normal, f10_mantissa,
denormal_f32_mantissa);
// Bias the exponent and build the float32 number.
spv::Id f32_shifted;
{
spv::Instruction* f32_insert_instruction = new spv::Instruction(
builder.getUniqueId(), type_uint, spv::OpBitFieldInsert);
f32_insert_instruction->addIdOperand(f32_mantissa);
f32_insert_instruction->addIdOperand(
builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent,
builder.makeUintConstant(124)));
f32_insert_instruction->addIdOperand(builder.makeUintConstant(7));
f32_insert_instruction->addIdOperand(builder.makeUintConstant(8));
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(f32_insert_instruction));
f32_shifted = f32_insert_instruction->getResultId();
}
spv::Id f32 =
builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted,
builder.makeUintConstant(23 - 7));
if (!result_as_uint) {
f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32);
}
return f32;
}
spv::Id SpirvShaderTranslator::PreClampedDepthTo20e4(
spv::Builder& builder, spv::Id f32_scalar, bool remap_from_0_to_0_5,
spv::Id ext_inst_glsl_std_450) {
// CFloat24 from d3dref9.dll +
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// Assuming the value is already clamped to [0, 2) (in all places, the depth
// is written with saturation).
uint32_t remap_bias = uint32_t(remap_from_0_to_0_5);
spv::Id type_uint = builder.makeUintType(32);
// Need the source as uint for bit operations.
{
spv::Id source_type = builder.getTypeId(f32_scalar);
assert_true(builder.isScalarType(source_type));
if (!builder.isUintType(source_type)) {
f32_scalar = builder.createUnaryOp(spv::OpBitcast, type_uint, f32_scalar);
}
}
// The denormal 20e4 case.
// denormal_biased_f32 = (f32 & 0x7FFFFF) | 0x800000
spv::Id denormal_biased_f32;
{
spv::Instruction* denormal_insert_instruction = new spv::Instruction(
builder.getUniqueId(), type_uint, spv::OpBitFieldInsert);
denormal_insert_instruction->addIdOperand(f32_scalar);
denormal_insert_instruction->addIdOperand(builder.makeUintConstant(1));
denormal_insert_instruction->addIdOperand(builder.makeUintConstant(23));
denormal_insert_instruction->addIdOperand(builder.makeUintConstant(9));
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(denormal_insert_instruction));
denormal_biased_f32 = denormal_insert_instruction->getResultId();
}
// denormal_biased_f32_shift_amount = min(113 - (f32 >> 23), 24)
// Not allowing the shift to overflow as that's undefined in SPIR-V.
spv::Id denormal_biased_f32_shift_amount;
{
spv::Instruction* denormal_shift_amount_instruction =
new spv::Instruction(builder.getUniqueId(), type_uint, spv::OpExtInst);
denormal_shift_amount_instruction->addIdOperand(ext_inst_glsl_std_450);
denormal_shift_amount_instruction->addImmediateOperand(GLSLstd450UMin);
denormal_shift_amount_instruction->addIdOperand(builder.createBinOp(
spv::OpISub, type_uint, builder.makeUintConstant(113 - remap_bias),
builder.createBinOp(spv::OpShiftRightLogical, type_uint, f32_scalar,
builder.makeUintConstant(23))));
denormal_shift_amount_instruction->addIdOperand(
builder.makeUintConstant(24));
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(denormal_shift_amount_instruction));
denormal_biased_f32_shift_amount =
denormal_shift_amount_instruction->getResultId();
}
// denormal_biased_f32 =
// ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24)
denormal_biased_f32 = builder.createBinOp(spv::OpShiftRightLogical, type_uint,
denormal_biased_f32,
denormal_biased_f32_shift_amount);
// The normal 20e4 case.
// Bias the exponent.
// normal_biased_f32 = f32 - (112 << 23)
spv::Id normal_biased_f32 = builder.createBinOp(
spv::OpISub, type_uint, f32_scalar,
builder.makeUintConstant((UINT32_C(112) + remap_bias) << 23));
// Select the needed conversion depending on whether the number is too small
// to be represented as normalized 20e4.
spv::Id biased_f32 = builder.createTriOp(
spv::OpSelect, type_uint,
builder.createBinOp(
spv::OpULessThan, builder.makeBoolType(), f32_scalar,
builder.makeUintConstant(0x38800000 - (remap_bias << 23))),
denormal_biased_f32, normal_biased_f32);
// Build the 20e4 number rounding to the nearest even.
// ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
return builder.createTriOp(
spv::OpBitFieldUExtract, type_uint,
builder.createBinOp(
spv::OpIAdd, type_uint,
builder.createBinOp(spv::OpIAdd, type_uint, biased_f32,
builder.makeUintConstant(3)),
builder.createTriOp(spv::OpBitFieldUExtract, type_uint, biased_f32,
builder.makeUintConstant(3),
builder.makeUintConstant(1))),
builder.makeUintConstant(3), builder.makeUintConstant(24));
}
spv::Id SpirvShaderTranslator::Depth20e4To32(spv::Builder& builder,
spv::Id f24_uint_scalar,
uint32_t f24_shift,
bool remap_to_0_to_0_5,
bool result_as_uint,
spv::Id ext_inst_glsl_std_450) {
// CFloat24 from d3dref9.dll +
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
assert_true(builder.isUintType(builder.getTypeId(f24_uint_scalar)));
assert_true(f24_shift <= (32 - 24));
uint32_t remap_bias = uint32_t(remap_to_0_to_0_5);
spv::Id type_bool = builder.makeBoolType();
spv::Id type_int = builder.makeIntType(32);
spv::Id type_uint = builder.makeUintType(32);
spv::Id f24_unbiased_exponent = builder.createTriOp(
spv::OpBitFieldUExtract, type_uint, f24_uint_scalar,
builder.makeUintConstant(f24_shift + 20), builder.makeUintConstant(4));
spv::Id f24_mantissa = builder.createTriOp(
spv::OpBitFieldUExtract, type_uint, f24_uint_scalar,
builder.makeUintConstant(f24_shift), builder.makeUintConstant(20));
// The denormal nonzero 20e4 case.
// denormal_mantissa_msb = findMSB(f24_mantissa)
spv::Id denormal_mantissa_msb;
{
spv::Instruction* denormal_mantissa_msb_instruction =
new spv::Instruction(builder.getUniqueId(), type_int, spv::OpExtInst);
denormal_mantissa_msb_instruction->addIdOperand(ext_inst_glsl_std_450);
denormal_mantissa_msb_instruction->addImmediateOperand(GLSLstd450FindUMsb);
denormal_mantissa_msb_instruction->addIdOperand(f24_mantissa);
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(denormal_mantissa_msb_instruction));
denormal_mantissa_msb = denormal_mantissa_msb_instruction->getResultId();
}
denormal_mantissa_msb =
builder.createUnaryOp(spv::OpBitcast, type_uint, denormal_mantissa_msb);
// denormal_f32_unbiased_exponent = 1 - (20 - findMSB(f24_mantissa))
// Or:
// denormal_f32_unbiased_exponent = findMSB(f24_mantissa) - 19
spv::Id denormal_f32_unbiased_exponent =
builder.createBinOp(spv::OpISub, type_uint, denormal_mantissa_msb,
builder.makeUintConstant(19));
// Normalize the mantissa.
// denormal_f32_mantissa = f24_mantissa << (20 - findMSB(f24_mantissa))
spv::Id denormal_f32_mantissa = builder.createBinOp(
spv::OpShiftLeftLogical, type_uint, f24_mantissa,
builder.createBinOp(spv::OpISub, type_uint, builder.makeUintConstant(20),
denormal_mantissa_msb));
// If the 20e4 number is zero, make sure the float32 number is zero too.
spv::Id f24_mantissa_is_nonzero = builder.createBinOp(
spv::OpINotEqual, type_bool, f24_mantissa, builder.makeUintConstant(0));
// Set the unbiased exponent to -112 for zero - 112 will be added later,
// resulting in zero float32.
denormal_f32_unbiased_exponent = builder.createTriOp(
spv::OpSelect, type_uint, f24_mantissa_is_nonzero,
denormal_f32_unbiased_exponent,
builder.makeUintConstant(uint32_t(-int32_t(112 - remap_bias))));
denormal_f32_mantissa =
builder.createTriOp(spv::OpSelect, type_uint, f24_mantissa_is_nonzero,
denormal_f32_mantissa, builder.makeUintConstant(0));
// Select the needed conversion depending on whether the number is normal.
spv::Id f24_is_normal =
builder.createBinOp(spv::OpINotEqual, type_bool, f24_unbiased_exponent,
builder.makeUintConstant(0));
spv::Id f32_unbiased_exponent = builder.createTriOp(
spv::OpSelect, type_uint, f24_is_normal, f24_unbiased_exponent,
denormal_f32_unbiased_exponent);
spv::Id f32_mantissa =
builder.createTriOp(spv::OpSelect, type_uint, f24_is_normal, f24_mantissa,
denormal_f32_mantissa);
// Bias the exponent and build the float32 number.
spv::Id f32_shifted;
{
spv::Instruction* f32_insert_instruction = new spv::Instruction(
builder.getUniqueId(), type_uint, spv::OpBitFieldInsert);
f32_insert_instruction->addIdOperand(f32_mantissa);
f32_insert_instruction->addIdOperand(
builder.createBinOp(spv::OpIAdd, type_uint, f32_unbiased_exponent,
builder.makeUintConstant(112 - remap_bias)));
f32_insert_instruction->addIdOperand(builder.makeUintConstant(20));
f32_insert_instruction->addIdOperand(builder.makeUintConstant(8));
builder.getBuildPoint()->addInstruction(
std::unique_ptr<spv::Instruction>(f32_insert_instruction));
f32_shifted = f32_insert_instruction->getResultId();
}
spv::Id f32 =
builder.createBinOp(spv::OpShiftLeftLogical, type_uint, f32_shifted,
builder.makeUintConstant(23 - 20));
if (!result_as_uint) {
f32 = builder.createUnaryOp(spv::OpBitcast, builder.makeFloatType(32), f32);
}
return f32;
}
} // namespace gpu
} // namespace xe
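For reference, the 7e3 packing that PreClampedFloat32To7e3 emits as SPIR-V above can be restated as scalar C++ (a sketch under the same assumptions - the input is already clamped to [0, 31.875]; not part of the commit). The 20e4 depth path follows the same structure with different bit widths and biases. As a sanity check, 31.875f, the largest representable value, packs to 0x3FF.

#include <algorithm>
#include <cstdint>
#include <cstring>

uint32_t PreClampedFloat32To7e3Scalar(float value) {
  uint32_t f32;
  std::memcpy(&f32, &value, sizeof(f32));
  uint32_t biased_f32;
  if (f32 < 0x3E800000u) {
    // Too small to be a normalized 7e3 number - shift the mantissa with the
    // implicit 1 into the denormal range, clamping the shift amount so it
    // never exceeds the word width.
    biased_f32 = ((f32 & 0x7FFFFFu) | 0x800000u) >>
                 std::min(125u - (f32 >> 23), 24u);
  } else {
    // Rebias the exponent from the float32 bias of 127 to the 7e3 bias of 3.
    biased_f32 = f32 - (UINT32_C(124) << 23);
  }
  // Round to the nearest even and keep the 10 bits of the result.
  return ((biased_f32 + 0x7FFFu + ((biased_f32 >> 16) & 1u)) >> 16) & 0x3FFu;
}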

View File

@ -103,6 +103,37 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) {
args.pipeline);
} break;
case Command::kVkBindVertexBuffers: {
auto& args = *reinterpret_cast<const ArgsVkBindVertexBuffers*>(stream);
size_t offset_bytes =
xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer));
const VkBuffer* buffers = reinterpret_cast<const VkBuffer*>(
reinterpret_cast<const uint8_t*>(stream) + offset_bytes);
offset_bytes =
xe::align(offset_bytes + sizeof(VkBuffer) * args.binding_count,
alignof(VkDeviceSize));
const VkDeviceSize* offsets = reinterpret_cast<const VkDeviceSize*>(
reinterpret_cast<const uint8_t*>(stream) + offset_bytes);
dfn.vkCmdBindVertexBuffers(command_buffer, args.first_binding,
args.binding_count, buffers, offsets);
} break;
case Command::kVkClearAttachments: {
auto& args = *reinterpret_cast<const ArgsVkClearAttachments*>(stream);
size_t offset_bytes = xe::align(sizeof(ArgsVkClearAttachments),
alignof(VkClearAttachment));
const VkClearAttachment* attachments =
reinterpret_cast<const VkClearAttachment*>(
reinterpret_cast<const uint8_t*>(stream) + offset_bytes);
offset_bytes = xe::align(
offset_bytes + sizeof(VkClearAttachment) * args.attachment_count,
alignof(VkClearRect));
const VkClearRect* rects = reinterpret_cast<const VkClearRect*>(
reinterpret_cast<const uint8_t*>(stream) + offset_bytes);
dfn.vkCmdClearAttachments(command_buffer, args.attachment_count,
attachments, args.rect_count, rects);
} break;
case Command::kVkCopyBuffer: {
auto& args = *reinterpret_cast<const ArgsVkCopyBuffer*>(stream);
dfn.vkCmdCopyBuffer(
@ -112,6 +143,12 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) {
xe::align(sizeof(ArgsVkCopyBuffer), alignof(VkBufferCopy))));
} break;
case Command::kVkDispatch: {
auto& args = *reinterpret_cast<const ArgsVkDispatch*>(stream);
dfn.vkCmdDispatch(command_buffer, args.group_count_x,
args.group_count_y, args.group_count_z);
} break;
case Command::kVkDraw: {
auto& args = *reinterpret_cast<const ArgsVkDraw*>(stream);
dfn.vkCmdDraw(command_buffer, args.vertex_count, args.instance_count,
@ -168,6 +205,14 @@ void DeferredCommandBuffer::Execute(VkCommandBuffer command_buffer) {
args.image_memory_barrier_count, image_memory_barriers);
} break;
case Command::kVkPushConstants: {
auto& args = *reinterpret_cast<const ArgsVkPushConstants*>(stream);
dfn.vkCmdPushConstants(command_buffer, args.layout, args.stage_flags,
args.offset, args.size,
reinterpret_cast<const uint8_t*>(stream) +
sizeof(ArgsVkPushConstants));
} break;
case Command::kVkSetBlendConstants: {
auto& args = *reinterpret_cast<const ArgsVkSetBlendConstants*>(stream);
dfn.vkCmdSetBlendConstants(command_buffer, args.blend_constants);

View File

@ -108,6 +108,61 @@ class DeferredCommandBuffer {
args.pipeline = pipeline;
}
void CmdVkBindVertexBuffers(uint32_t first_binding, uint32_t binding_count,
const VkBuffer* buffers,
const VkDeviceSize* offsets) {
size_t arguments_size =
xe::align(sizeof(ArgsVkBindVertexBuffers), alignof(VkBuffer));
size_t buffers_offset = arguments_size;
arguments_size =
xe::align(arguments_size + sizeof(VkBuffer) * binding_count,
alignof(VkDeviceSize));
size_t offsets_offset = arguments_size;
arguments_size += sizeof(VkDeviceSize) * binding_count;
uint8_t* args_ptr = reinterpret_cast<uint8_t*>(
WriteCommand(Command::kVkBindVertexBuffers, arguments_size));
auto& args = *reinterpret_cast<ArgsVkBindVertexBuffers*>(args_ptr);
args.first_binding = first_binding;
args.binding_count = binding_count;
std::memcpy(args_ptr + buffers_offset, buffers,
sizeof(VkBuffer) * binding_count);
std::memcpy(args_ptr + offsets_offset, offsets,
sizeof(VkDeviceSize) * binding_count);
}
void CmdClearAttachmentsEmplace(uint32_t attachment_count,
VkClearAttachment*& attachments_out,
uint32_t rect_count,
VkClearRect*& rects_out) {
size_t arguments_size =
xe::align(sizeof(ArgsVkClearAttachments), alignof(VkClearAttachment));
size_t attachments_offset = arguments_size;
arguments_size =
xe::align(arguments_size + sizeof(VkClearAttachment) * attachment_count,
alignof(VkClearRect));
size_t rects_offset = arguments_size;
arguments_size += sizeof(VkClearRect) * rect_count;
uint8_t* args_ptr = reinterpret_cast<uint8_t*>(
WriteCommand(Command::kVkClearAttachments, arguments_size));
auto& args = *reinterpret_cast<ArgsVkClearAttachments*>(args_ptr);
args.attachment_count = attachment_count;
args.rect_count = rect_count;
attachments_out =
reinterpret_cast<VkClearAttachment*>(args_ptr + attachments_offset);
rects_out = reinterpret_cast<VkClearRect*>(args_ptr + rects_offset);
}
void CmdVkClearAttachments(uint32_t attachment_count,
const VkClearAttachment* attachments,
uint32_t rect_count, const VkClearRect* rects) {
VkClearAttachment* attachments_arg;
VkClearRect* rects_arg;
CmdClearAttachmentsEmplace(attachment_count, attachments_arg, rect_count,
rects_arg);
std::memcpy(attachments_arg, attachments,
sizeof(VkClearAttachment) * attachment_count);
std::memcpy(rects_arg, rects, sizeof(VkClearRect) * rect_count);
}
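A hypothetical usage sketch of the new deferred clear command (the deferred_command_buffer_ object and the 1280x720 extent are illustrative, not from the commit): recording a full-rectangle clear of color attachment 0 for later replay inside the render pass.

VkClearAttachment clear_attachment;
clear_attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
clear_attachment.colorAttachment = 0;
clear_attachment.clearValue.color.float32[0] = 0.0f;
clear_attachment.clearValue.color.float32[1] = 0.0f;
clear_attachment.clearValue.color.float32[2] = 0.0f;
clear_attachment.clearValue.color.float32[3] = 1.0f;
VkClearRect clear_rect;
clear_rect.rect.offset = {0, 0};
clear_rect.rect.extent = {1280, 720};
clear_rect.baseArrayLayer = 0;
clear_rect.layerCount = 1;
deferred_command_buffer_.CmdVkClearAttachments(1, &clear_attachment, 1,
                                               &clear_rect);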
VkBufferCopy* CmdCopyBufferEmplace(VkBuffer src_buffer, VkBuffer dst_buffer,
uint32_t region_count) {
const size_t header_size =
@ -127,6 +182,15 @@ class DeferredCommandBuffer {
regions, sizeof(VkBufferCopy) * region_count);
}
void CmdVkDispatch(uint32_t group_count_x, uint32_t group_count_y,
uint32_t group_count_z) {
auto& args = *reinterpret_cast<ArgsVkDispatch*>(
WriteCommand(Command::kVkDispatch, sizeof(ArgsVkDispatch)));
args.group_count_x = group_count_x;
args.group_count_y = group_count_y;
args.group_count_z = group_count_z;
}
void CmdVkDraw(uint32_t vertex_count, uint32_t instance_count,
uint32_t first_vertex, uint32_t first_instance) {
auto& args = *reinterpret_cast<ArgsVkDraw*>(
@ -162,6 +226,19 @@ class DeferredCommandBuffer {
uint32_t image_memory_barrier_count,
const VkImageMemoryBarrier* image_memory_barriers);
void CmdVkPushConstants(VkPipelineLayout layout,
VkShaderStageFlags stage_flags, uint32_t offset,
uint32_t size, const void* values) {
uint8_t* args_ptr = reinterpret_cast<uint8_t*>(WriteCommand(
Command::kVkPushConstants, sizeof(ArgsVkPushConstants) + size));
auto& args = *reinterpret_cast<ArgsVkPushConstants*>(args_ptr);
args.layout = layout;
args.stage_flags = stage_flags;
args.offset = offset;
args.size = size;
std::memcpy(args_ptr + sizeof(ArgsVkPushConstants), values, size);
}
void CmdVkSetBlendConstants(const float* blend_constants) {
auto& args = *reinterpret_cast<ArgsVkSetBlendConstants*>(WriteCommand(
Command::kVkSetBlendConstants, sizeof(ArgsVkSetBlendConstants)));
@ -237,11 +314,15 @@ class DeferredCommandBuffer {
kVkBindDescriptorSets,
kVkBindIndexBuffer,
kVkBindPipeline,
kVkBindVertexBuffers,
kVkClearAttachments,
kVkCopyBuffer,
kVkDispatch,
kVkDraw,
kVkDrawIndexed,
kVkEndRenderPass,
kVkPipelineBarrier,
kVkPushConstants,
kVkSetBlendConstants,
kVkSetDepthBias,
kVkSetScissor,
@ -289,6 +370,22 @@ class DeferredCommandBuffer {
VkPipeline pipeline;
};
struct ArgsVkBindVertexBuffers {
uint32_t first_binding;
uint32_t binding_count;
// Followed by aligned VkBuffer[], VkDeviceSize[].
static_assert(alignof(VkBuffer) <= alignof(uintmax_t));
static_assert(alignof(VkDeviceSize) <= alignof(uintmax_t));
};
struct ArgsVkClearAttachments {
uint32_t attachment_count;
uint32_t rect_count;
// Followed by aligned VkClearAttachment[], VkClearRect[].
static_assert(alignof(VkClearAttachment) <= alignof(uintmax_t));
static_assert(alignof(VkClearRect) <= alignof(uintmax_t));
};
struct ArgsVkCopyBuffer {
VkBuffer src_buffer;
VkBuffer dst_buffer;
@ -297,6 +394,12 @@ class DeferredCommandBuffer {
static_assert(alignof(VkBufferCopy) <= alignof(uintmax_t));
};
struct ArgsVkDispatch {
uint32_t group_count_x;
uint32_t group_count_y;
uint32_t group_count_z;
};
struct ArgsVkDraw {
uint32_t vertex_count;
uint32_t instance_count;
@ -326,6 +429,14 @@ class DeferredCommandBuffer {
static_assert(alignof(VkImageMemoryBarrier) <= alignof(uintmax_t));
};
struct ArgsVkPushConstants {
VkPipelineLayout layout;
VkShaderStageFlags stage_flags;
uint32_t offset;
uint32_t size;
// Followed by `size` bytes of values.
};
struct ArgsVkSetBlendConstants {
float blend_constants[4];
};

View File

@ -8,6 +8,7 @@ project("xenia-gpu-vulkan")
language("C++")
links({
"fmt",
"glslang-spirv",
"xenia-base",
"xenia-gpu",
"xenia-ui",

View File

@ -476,7 +476,7 @@ bool VulkanCommandProcessor::SetupContext() {
swap_pipeline_create_info.renderPass = swap_render_pass_;
swap_pipeline_create_info.subpass = 0;
swap_pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE;
swap_pipeline_create_info.basePipelineIndex = UINT32_MAX;
swap_pipeline_create_info.basePipelineIndex = -1;
VkResult swap_pipeline_create_result = dfn.vkCreateGraphicsPipelines(
device, VK_NULL_HANDLE, 1, &swap_pipeline_create_info, nullptr,
&swap_pipeline_);
@ -810,8 +810,6 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr,
deferred_command_buffer_.CmdVkBeginRenderPass(
&render_pass_begin_info, VK_SUBPASS_CONTENTS_INLINE);
dynamic_viewport_update_needed_ = true;
dynamic_scissor_update_needed_ = true;
VkViewport viewport;
viewport.x = 0.0f;
viewport.y = 0.0f;
@ -819,13 +817,13 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr,
viewport.height = float(scaled_height);
viewport.minDepth = 0.0f;
viewport.maxDepth = 1.0f;
deferred_command_buffer_.CmdVkSetViewport(0, 1, &viewport);
VkRect2D scissor_rect;
scissor_rect.offset.x = 0;
scissor_rect.offset.y = 0;
scissor_rect.extent.width = scaled_width;
scissor_rect.extent.height = scaled_height;
deferred_command_buffer_.CmdVkSetScissor(0, 1, &scissor_rect);
SetViewport(viewport);
VkRect2D scissor;
scissor.offset.x = 0;
scissor.offset.y = 0;
scissor.extent.width = scaled_width;
scissor.extent.height = scaled_height;
SetScissor(scissor);
BindExternalGraphicsPipeline(swap_pipeline_);
@ -856,7 +854,7 @@ void VulkanCommandProcessor::IssueSwap(uint32_t frontbuffer_ptr,
EndSubmission(true);
}
void VulkanCommandProcessor::PushBufferMemoryBarrier(
bool VulkanCommandProcessor::PushBufferMemoryBarrier(
VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask,
@ -865,7 +863,7 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier(
if (skip_if_equal && src_stage_mask == dst_stage_mask &&
src_access_mask == dst_access_mask &&
src_queue_family_index == dst_queue_family_index) {
return;
return false;
}
// Separate different barriers for overlapping buffer ranges into different
@ -889,10 +887,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier(
src_queue_family_index &&
other_buffer_memory_barrier.dstQueueFamilyIndex ==
dst_queue_family_index) {
// The barrier is already present.
// The barrier is already pending.
current_pending_barrier_.src_stage_mask |= src_stage_mask;
current_pending_barrier_.dst_stage_mask |= dst_stage_mask;
return;
return true;
}
SplitPendingBarrier();
break;
@ -911,9 +909,10 @@ void VulkanCommandProcessor::PushBufferMemoryBarrier(
buffer_memory_barrier.buffer = buffer;
buffer_memory_barrier.offset = offset;
buffer_memory_barrier.size = size;
return true;
}
void VulkanCommandProcessor::PushImageMemoryBarrier(
bool VulkanCommandProcessor::PushImageMemoryBarrier(
VkImage image, const VkImageSubresourceRange& subresource_range,
VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask,
@ -923,7 +922,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier(
if (skip_if_equal && src_stage_mask == dst_stage_mask &&
src_access_mask == dst_access_mask && old_layout == new_layout &&
src_queue_family_index == dst_queue_family_index) {
return;
return false;
}
// Separate different barriers for overlapping image subresource ranges into
@ -969,10 +968,10 @@ void VulkanCommandProcessor::PushImageMemoryBarrier(
src_queue_family_index &&
other_image_memory_barrier.dstQueueFamilyIndex ==
dst_queue_family_index) {
// The barrier is already present.
// The barrier is already pending.
current_pending_barrier_.src_stage_mask |= src_stage_mask;
current_pending_barrier_.dst_stage_mask |= dst_stage_mask;
return;
return true;
}
SplitPendingBarrier();
break;
@ -992,6 +991,7 @@ void VulkanCommandProcessor::PushImageMemoryBarrier(
image_memory_barrier.dstQueueFamilyIndex = dst_queue_family_index;
image_memory_barrier.image = image;
image_memory_barrier.subresourceRange = subresource_range;
return true;
}
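A hypothetical caller-side sketch of the new bool return of the barrier helpers (the edram_buffer handle and the specific stage/access masks are illustrative, not from the commit): true means a barrier is now pending (newly recorded or merged into an already pending one), false means the request was dropped as redundant, so the caller can skip work that only matters when a barrier was actually queued.

if (command_processor.PushBufferMemoryBarrier(
        edram_buffer, 0, VK_WHOLE_SIZE, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT,
        VK_ACCESS_SHADER_READ_BIT)) {
  // A usage switch actually happened - make sure the pending barriers are
  // recorded before the next dispatch touching the buffer.
  command_processor.SubmitBarriers(true);
}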
bool VulkanCommandProcessor::SubmitBarriers(bool force_end_render_pass) {
@ -1257,6 +1257,53 @@ void VulkanCommandProcessor::BindExternalGraphicsPipeline(
current_guest_graphics_pipeline_layout_ = VK_NULL_HANDLE;
}
void VulkanCommandProcessor::BindExternalComputePipeline(VkPipeline pipeline) {
if (current_external_compute_pipeline_ == pipeline) {
return;
}
deferred_command_buffer_.CmdVkBindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline);
current_external_compute_pipeline_ = pipeline;
}
void VulkanCommandProcessor::SetViewport(const VkViewport& viewport) {
if (!dynamic_viewport_update_needed_) {
dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x;
dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y;
dynamic_viewport_update_needed_ |=
dynamic_viewport_.width != viewport.width;
dynamic_viewport_update_needed_ |=
dynamic_viewport_.height != viewport.height;
dynamic_viewport_update_needed_ |=
dynamic_viewport_.minDepth != viewport.minDepth;
dynamic_viewport_update_needed_ |=
dynamic_viewport_.maxDepth != viewport.maxDepth;
}
if (dynamic_viewport_update_needed_) {
dynamic_viewport_ = viewport;
deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_);
dynamic_viewport_update_needed_ = false;
}
}
void VulkanCommandProcessor::SetScissor(const VkRect2D& scissor) {
if (!dynamic_scissor_update_needed_) {
dynamic_scissor_update_needed_ |=
dynamic_scissor_.offset.x != scissor.offset.x;
dynamic_scissor_update_needed_ |=
dynamic_scissor_.offset.y != scissor.offset.y;
dynamic_scissor_update_needed_ |=
dynamic_scissor_.extent.width != scissor.extent.width;
dynamic_scissor_update_needed_ |=
dynamic_scissor_.extent.height != scissor.extent.height;
}
if (dynamic_scissor_update_needed_) {
dynamic_scissor_ = scissor;
deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_);
dynamic_scissor_update_needed_ = false;
}
}
Shader* VulkanCommandProcessor::LoadShader(xenos::ShaderType shader_type,
uint32_t guest_address,
const uint32_t* host_address,
@ -1417,8 +1464,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
}
const ui::vulkan::VulkanProvider& provider = GetVulkanProvider();
const VkPhysicalDeviceProperties& device_properties =
provider.device_properties();
const VkPhysicalDeviceLimits& device_limits =
provider.device_properties().limits;
// Get dynamic rasterizer state.
draw_util::ViewportInfo viewport_info;
@ -1438,10 +1485,10 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
// life. Or even disregard the viewport bounds range in the fragment shader
// interlocks case completely - apply the viewport and the scissor offset
// directly to pixel address and to things like ps_param_gen.
draw_util::GetHostViewportInfo(
regs, 1, 1, false, device_properties.limits.maxViewportDimensions[0],
device_properties.limits.maxViewportDimensions[1], true, false, false,
false, viewport_info);
draw_util::GetHostViewportInfo(regs, 1, 1, false,
device_limits.maxViewportDimensions[0],
device_limits.maxViewportDimensions[1], true,
false, false, false, viewport_info);
// Update dynamic graphics pipeline state.
UpdateDynamicState(viewport_info, primitive_polygonal);
@ -1675,6 +1722,8 @@ void VulkanCommandProcessor::CheckSubmissionFenceAndDeviceLoss(
primitive_processor_->CompletedSubmissionUpdated();
render_target_cache_->CompletedSubmissionUpdated();
// Destroy outdated swap objects.
while (!swap_framebuffers_outdated_.empty()) {
const auto& framebuffer_pair = swap_framebuffers_outdated_.front();
@ -1752,6 +1801,7 @@ bool VulkanCommandProcessor::BeginSubmission(bool is_guest_command) {
current_framebuffer_ = nullptr;
current_guest_graphics_pipeline_ = VK_NULL_HANDLE;
current_external_graphics_pipeline_ = VK_NULL_HANDLE;
current_external_compute_pipeline_ = VK_NULL_HANDLE;
current_guest_graphics_pipeline_layout_ = nullptr;
current_graphics_descriptor_sets_bound_up_to_date_ = 0;
@ -1861,6 +1911,8 @@ bool VulkanCommandProcessor::EndSubmission(bool is_swap) {
if (submission_open_) {
EndRenderPass();
render_target_cache_->EndSubmission();
primitive_processor_->EndSubmission();
shared_memory_->EndSubmission();
@ -2112,20 +2164,7 @@ void VulkanCommandProcessor::UpdateDynamicState(
}
viewport.minDepth = viewport_info.z_min;
viewport.maxDepth = viewport_info.z_max;
dynamic_viewport_update_needed_ |= dynamic_viewport_.x != viewport.x;
dynamic_viewport_update_needed_ |= dynamic_viewport_.y != viewport.y;
dynamic_viewport_update_needed_ |= dynamic_viewport_.width != viewport.width;
dynamic_viewport_update_needed_ |=
dynamic_viewport_.height != viewport.height;
dynamic_viewport_update_needed_ |=
dynamic_viewport_.minDepth != viewport.minDepth;
dynamic_viewport_update_needed_ |=
dynamic_viewport_.maxDepth != viewport.maxDepth;
if (dynamic_viewport_update_needed_) {
dynamic_viewport_ = viewport;
deferred_command_buffer_.CmdVkSetViewport(0, 1, &dynamic_viewport_);
dynamic_viewport_update_needed_ = false;
}
SetViewport(viewport);
// Scissor.
draw_util::Scissor scissor;
@ -2135,19 +2174,7 @@ void VulkanCommandProcessor::UpdateDynamicState(
scissor_rect.offset.y = int32_t(scissor.offset[1]);
scissor_rect.extent.width = scissor.extent[0];
scissor_rect.extent.height = scissor.extent[1];
dynamic_scissor_update_needed_ |=
dynamic_scissor_.offset.x != scissor_rect.offset.x;
dynamic_scissor_update_needed_ |=
dynamic_scissor_.offset.y != scissor_rect.offset.y;
dynamic_scissor_update_needed_ |=
dynamic_scissor_.extent.width != scissor_rect.extent.width;
dynamic_scissor_update_needed_ |=
dynamic_scissor_.extent.height != scissor_rect.extent.height;
if (dynamic_scissor_update_needed_) {
dynamic_scissor_ = scissor_rect;
deferred_command_buffer_.CmdVkSetScissor(0, 1, &dynamic_scissor_);
dynamic_scissor_update_needed_ = false;
}
SetScissor(scissor_rect);
// Depth bias.
// TODO(Triang3l): Disable the depth bias for the fragment shader interlock RB

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -81,15 +81,16 @@ class VulkanCommandProcessor : public CommandProcessor {
uint64_t GetCurrentFrame() const { return frame_current_; }
uint64_t GetCompletedFrame() const { return frame_completed_; }
// Submission must be open to insert barriers.
void PushBufferMemoryBarrier(
// Submission must be open to insert barriers. Returns true if the barrier
// has actually been inserted rather than dropped.
bool PushBufferMemoryBarrier(
VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask,
uint32_t src_queue_family_index = VK_QUEUE_FAMILY_IGNORED,
uint32_t dst_queue_family_index = VK_QUEUE_FAMILY_IGNORED,
bool skip_if_equal = true);
void PushImageMemoryBarrier(
bool PushImageMemoryBarrier(
VkImage image, const VkImageSubresourceRange& subresource_range,
VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
VkAccessFlags src_access_mask, VkAccessFlags dst_access_mask,
@ -125,6 +126,9 @@ class VulkanCommandProcessor : public CommandProcessor {
bool keep_dynamic_depth_bias = false,
bool keep_dynamic_blend_constants = false,
bool keep_dynamic_stencil_mask_ref = false);
void BindExternalComputePipeline(VkPipeline pipeline);
void SetViewport(const VkViewport& viewport);
void SetScissor(const VkRect2D& scissor);
protected:
bool SetupContext() override;
@ -211,6 +215,9 @@ class VulkanCommandProcessor : public CommandProcessor {
// open non-frame submission, BeginSubmission(true) will promote it to a
// frame. EndSubmission(true) will close the frame no matter whether the
// submission has already been closed.
// Unlike on Direct3D 12, submission boundaries do not imply any memory
// barriers aside from an incoming host write (but not outgoing host read)
// dependency.
// Rechecks submission number and reclaims per-submission resources. Pass 0 as
// the submission to await to simply check status, or pass
@ -396,6 +403,7 @@ class VulkanCommandProcessor : public CommandProcessor {
// TODO(Triang3l): Change to a deferred compilation handle.
VkPipeline current_guest_graphics_pipeline_;
VkPipeline current_external_graphics_pipeline_;
VkPipeline current_external_compute_pipeline_;
// Pipeline layout of the current guest graphics pipeline.
const PipelineLayout* current_guest_graphics_pipeline_layout_;

View File

@ -884,11 +884,25 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
// TODO(Triang3l): Wide lines.
rasterization_state.lineWidth = 1.0f;
VkSampleMask sample_mask = UINT32_MAX;
VkPipelineMultisampleStateCreateInfo multisample_state = {};
multisample_state.sType =
VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
multisample_state.rasterizationSamples = VkSampleCountFlagBits(
uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples));
if (description.render_pass_key.msaa_samples == xenos::MsaaSamples::k2X &&
!render_target_cache_.IsMsaa2xSupported(
description.render_pass_key.depth_and_color_used != 0)) {
// Using 4x samples 0 and 3 as samples 0 and 1 of 2x instead (not exactly
// the same sample locations, but still top-left and bottom-right; this can
// be adjusted with custom sample locations - see the sketch after this
// block).
multisample_state.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT;
sample_mask = 0b1001;
// TODO(Triang3l): Research sample mask behavior without attachments (in
// Direct3D, it's completely ignored in this case).
multisample_state.pSampleMask = &sample_mask;
} else {
multisample_state.rasterizationSamples = VkSampleCountFlagBits(
uint32_t(1) << uint32_t(description.render_pass_key.msaa_samples));
}
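The comment above notes that the 2x-as-4x sample positions are only approximately right and could be corrected with custom sample locations. A hedged sketch of what that could look like with VK_EXT_sample_locations chained into the multisample state built above (the extension must support 4 samples; placing the used samples 0 and 3 at the standard 2x positions is an assumption here, not the commit's code):

VkSampleLocationEXT sample_locations[4] = {
    {0.25f, 0.25f},  // Sample 0 - top-left (used).
    {0.25f, 0.25f},  // Sample 1 - unused (masked out by the sample mask).
    {0.75f, 0.75f},  // Sample 2 - unused (masked out by the sample mask).
    {0.75f, 0.75f},  // Sample 3 - bottom-right (used).
};
VkSampleLocationsInfoEXT sample_locations_info = {};
sample_locations_info.sType = VK_STRUCTURE_TYPE_SAMPLE_LOCATIONS_INFO_EXT;
sample_locations_info.sampleLocationsPerPixel = VK_SAMPLE_COUNT_4_BIT;
sample_locations_info.sampleLocationGridSize = {1, 1};
sample_locations_info.sampleLocationsCount = 4;
sample_locations_info.pSampleLocations = sample_locations;
VkPipelineSampleLocationsStateCreateInfoEXT sample_locations_state = {};
sample_locations_state.sType =
    VK_STRUCTURE_TYPE_PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT;
sample_locations_state.sampleLocationsEnable = VK_TRUE;
sample_locations_state.sampleLocationsInfo = sample_locations_info;
multisample_state.pNext = &sample_locations_state;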
VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {};
depth_stencil_state.sType =
@ -1061,7 +1075,7 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
pipeline_create_info.renderPass = creation_arguments.render_pass;
pipeline_create_info.subpass = 0;
pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE;
pipeline_create_info.basePipelineIndex = UINT32_MAX;
pipeline_create_info.basePipelineIndex = -1;
const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
VkDevice device = provider.device();

File diff suppressed because it is too large.

View File

@ -10,13 +10,20 @@
#ifndef XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_
#define XENIA_GPU_VULKAN_VULKAN_RENDER_TARGET_CACHE_H_
#include <array>
#include <cstdint>
#include <cstring>
#include <functional>
#include <memory>
#include <unordered_map>
#include "xenia/base/hash.h"
#include "xenia/base/xxhash.h"
#include "xenia/gpu/render_target_cache.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h"
#include "xenia/ui/vulkan/vulkan_provider.h"
#include "xenia/ui/vulkan/vulkan_upload_buffer_pool.h"
namespace xe {
namespace gpu {
@ -28,8 +35,12 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
public:
union RenderPassKey {
struct {
// If emulating 2x as 4x, set this to 4x for 2x not to create unnecessary
// render pass objects.
// If emulating 2x as 4x, this is still 2x for simplicity of using this
// field to make guest-related decisions. Render pass objects are not very
// expensive, and their dependencies can't be shared between 2x-as-4x and
// true 4x MSAA passes (framebuffers because render target cache render
// targets are different for 2x and 4x guest MSAA, pipelines because the
// sample mask will have 2 samples excluded for 2x-as-4x).
xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 2
// << 0 is depth, << 1...4 is color.
uint32_t depth_and_color_used : 1 + xenos::kMaxColorRenderTargets; // 7
@ -46,7 +57,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
xenos::ColorRenderTargetFormat color_2_view_format
: xenos::kColorRenderTargetFormatBits; // 20
xenos::ColorRenderTargetFormat color_3_view_format
: xenos::kColorRenderTargetFormatBits; // 24
: xenos::kColorRenderTargetFormatBits; // 24
uint32_t color_rts_use_transfer_formats : 1; // 25
};
uint32_t key = 0;
struct Hasher {
@ -60,6 +72,9 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
bool operator!=(const RenderPassKey& other_key) const {
return !(*this == other_key);
}
bool operator<(const RenderPassKey& other_key) const {
return key < other_key.key;
}
};
static_assert_size(RenderPassKey, sizeof(uint32_t));
@ -78,12 +93,14 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
void Shutdown(bool from_destructor = false);
void ClearCache() override;
// TOOD(Triang3l): Fragment shader interlock.
void CompletedSubmissionUpdated();
void EndSubmission();
// TODO(Triang3l): Fragment shader interlock.
Path GetPath() const override { return Path::kHostRenderTargets; }
// TODO(Triang3l): Resolution scaling.
uint32_t GetResolutionScaleX() const override { return 1; }
uint32_t GetResolutionScaleY() const override { return 1; }
uint32_t GetResolutionScaleX() const override { return resolution_scale_x_; }
uint32_t GetResolutionScaleY() const override { return resolution_scale_y_; }
bool Update(bool is_rasterization_done,
uint32_t shader_writes_color_targets) override;
@ -98,6 +115,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
return last_update_framebuffer_;
}
bool msaa_2x_attachments_supported() const {
return msaa_2x_attachments_supported_;
}
bool msaa_2x_no_attachments_supported() const {
return msaa_2x_no_attachments_supported_;
}
bool IsMsaa2xSupported(bool subpass_has_attachments) const {
return subpass_has_attachments ? msaa_2x_attachments_supported_
: msaa_2x_no_attachments_supported_;
}
// Returns the render pass object, or VK_NULL_HANDLE if failed to create.
// A render pass managed by the render target cache may be ended and resumed
// at any time (to allow for things like copying and texture loading).
@ -110,6 +138,99 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
bool* is_integer_out = nullptr) const;
protected:
uint32_t GetMaxRenderTargetWidth() const override;
uint32_t GetMaxRenderTargetHeight() const override;
RenderTarget* CreateRenderTarget(RenderTargetKey key) override;
// TODO(Triang3l): Check actual unorm24 support.
bool IsHostDepthEncodingDifferent(
xenos::DepthRenderTargetFormat format) const override {
return true;
}
private:
enum class EdramBufferUsage {
// There's no need for combined fragment and compute usages.
// With host render targets, the usual usage sequence is as follows:
// - Optionally compute writes - host depth copy storing for EDRAM range
// ownership transfers.
// - Optionally fragment reads - host depth copy storing for EDRAM range
// ownership transfers.
// - Compute writes - copying from host render targets during resolving.
// - Compute reads - writing to the shared memory during resolving.
// With the render backend implementation based on fragment shader
// interlocks, it's:
// - Fragment reads and writes - depth / stencil and color operations.
// - Compute reads - writing to the shared memory during resolving.
// So, fragment reads and compute reads normally don't follow each other,
// and there's no need to amortize the cost of a read > read barrier in an
// exceptional situation by using a wider barrier in the normal scenario.
// Host depth copy storing.
kFragmentRead,
// Fragment shader interlock depth / stencil and color operations.
kFragmentReadWrite,
// Resolve - copying to the shared memory.
kComputeRead,
// Resolve - copying from host render targets.
kComputeWrite,
// Trace recording.
kTransferRead,
// Trace playback.
kTransferWrite,
};
enum class EdramBufferModificationStatus {
// The values are ordered by how strong the barrier conditions are.
// No uncommitted shader writes.
kUnmodified,
// Need to commit before the next fragment shader interlock usage with
// overlap.
kViaFragmentShaderInterlock,
// Need to commit before any next fragment shader interlock usage.
kViaUnordered,
};
static void GetEdramBufferUsageMasks(EdramBufferUsage usage,
VkPipelineStageFlags& stage_mask_out,
VkAccessFlags& access_mask_out);
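A plausible shape for GetEdramBufferUsageMasks, consistent with the usage sequence described in the EdramBufferUsage comment above (an illustrative sketch only - the actual implementation is in the render target cache source diff suppressed above as too large):

void VulkanRenderTargetCache::GetEdramBufferUsageMasks(
    EdramBufferUsage usage, VkPipelineStageFlags& stage_mask_out,
    VkAccessFlags& access_mask_out) {
  switch (usage) {
    case EdramBufferUsage::kFragmentRead:
      stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT;
      break;
    case EdramBufferUsage::kFragmentReadWrite:
      stage_mask_out = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
      break;
    case EdramBufferUsage::kComputeRead:
      stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_READ_BIT;
      break;
    case EdramBufferUsage::kComputeWrite:
      stage_mask_out = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
      access_mask_out = VK_ACCESS_SHADER_WRITE_BIT;
      break;
    case EdramBufferUsage::kTransferRead:
      stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT;
      access_mask_out = VK_ACCESS_TRANSFER_READ_BIT;
      break;
    case EdramBufferUsage::kTransferWrite:
      stage_mask_out = VK_PIPELINE_STAGE_TRANSFER_BIT;
      access_mask_out = VK_ACCESS_TRANSFER_WRITE_BIT;
      break;
  }
}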
void UseEdramBuffer(EdramBufferUsage new_usage);
void MarkEdramBufferModified(
EdramBufferModificationStatus modification_status =
EdramBufferModificationStatus::kViaUnordered);
void CommitEdramBufferShaderWrites(
EdramBufferModificationStatus commit_status =
EdramBufferModificationStatus::kViaFragmentShaderInterlock);
VulkanCommandProcessor& command_processor_;
uint32_t resolution_scale_x_ = 1;
uint32_t resolution_scale_y_ = 1;
// Accessible in fragment and compute shaders.
VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE;
VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE;
VkDescriptorSetLayout descriptor_set_layout_sampled_image_x2_ =
VK_NULL_HANDLE;
std::unique_ptr<ui::vulkan::SingleLayoutDescriptorSetPool>
descriptor_set_pool_sampled_image_;
std::unique_ptr<ui::vulkan::SingleLayoutDescriptorSetPool>
descriptor_set_pool_sampled_image_x2_;
VkDeviceMemory edram_buffer_memory_ = VK_NULL_HANDLE;
VkBuffer edram_buffer_ = VK_NULL_HANDLE;
EdramBufferUsage edram_buffer_usage_;
EdramBufferModificationStatus edram_buffer_modification_status_ =
EdramBufferModificationStatus::kUnmodified;
VkDescriptorPool edram_storage_buffer_descriptor_pool_ = VK_NULL_HANDLE;
VkDescriptorSet edram_storage_buffer_descriptor_set_;
// RenderPassKey::key -> VkRenderPass.
// VK_NULL_HANDLE if failed to create.
std::unordered_map<uint32_t, VkRenderPass> render_passes_;
// For host render targets.
// Can only be destroyed when framebuffers referencing it are destroyed!
class VulkanRenderTarget final : public RenderTarget {
public:
@ -131,27 +252,45 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
// Takes ownership of the Vulkan objects passed to the constructor.
VulkanRenderTarget(RenderTargetKey key,
const ui::vulkan::VulkanProvider& provider,
VulkanRenderTargetCache& render_target_cache,
VkImage image, VkDeviceMemory memory,
VkImageView view_depth_color,
VkImageView view_depth_stencil, VkImageView view_stencil,
VkImageView view_srgb,
VkImageView view_color_transfer_separate)
VkImageView view_color_transfer_separate,
size_t descriptor_set_index_transfer_source)
: RenderTarget(key),
provider_(provider),
render_target_cache_(render_target_cache),
image_(image),
memory_(memory),
view_depth_color_(view_depth_color),
view_depth_stencil_(view_depth_stencil),
view_stencil_(view_stencil),
view_srgb_(view_srgb),
view_color_transfer_separate_(view_color_transfer_separate) {}
view_color_transfer_separate_(view_color_transfer_separate),
descriptor_set_index_transfer_source_(
descriptor_set_index_transfer_source) {}
~VulkanRenderTarget();
VkImage image() const { return image_; }
VkImageView view_depth_color() const { return view_depth_color_; }
VkImageView view_depth_stencil() const { return view_depth_stencil_; }
VkImageView view_color_transfer_separate() const {
return view_color_transfer_separate_;
}
VkImageView view_color_transfer() const {
return view_color_transfer_separate_ != VK_NULL_HANDLE
? view_color_transfer_separate_
: view_depth_color_;
}
VkDescriptorSet GetDescriptorSetTransferSource() const {
ui::vulkan::SingleLayoutDescriptorSetPool& descriptor_set_pool =
key().is_depth
? *render_target_cache_.descriptor_set_pool_sampled_image_x2_
: *render_target_cache_.descriptor_set_pool_sampled_image_;
return descriptor_set_pool.Get(descriptor_set_index_transfer_source_);
}
static void GetDrawUsage(bool is_depth,
VkPipelineStageFlags* stage_mask_out,
@ -185,8 +324,13 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
current_layout_ = layout;
}
uint32_t temporary_sort_index() const { return temporary_sort_index_; }
void SetTemporarySortIndex(uint32_t index) {
temporary_sort_index_ = index;
}
private:
const ui::vulkan::VulkanProvider& provider_;
VulkanRenderTargetCache& render_target_cache_;
VkImage image_;
VkDeviceMemory memory_;
@ -200,30 +344,17 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
VkImageView view_srgb_;
VkImageView view_color_transfer_separate_;
// 2 sampled images for depth / stencil, 1 sampled image for color.
size_t descriptor_set_index_transfer_source_;
VkPipelineStageFlags current_stage_mask_ = 0;
VkAccessFlags current_access_mask_ = 0;
VkImageLayout current_layout_ = VK_IMAGE_LAYOUT_UNDEFINED;
// Temporary storage for indices in operations like transfers and dumps.
uint32_t temporary_sort_index_ = 0;
};
uint32_t GetMaxRenderTargetWidth() const override;
uint32_t GetMaxRenderTargetHeight() const override;
RenderTarget* CreateRenderTarget(RenderTargetKey key) override;
// TODO(Triang3l): Check actual unorm24 support.
bool IsHostDepthEncodingDifferent(
xenos::DepthRenderTargetFormat format) const override {
return true;
}
private:
VulkanCommandProcessor& command_processor_;
// RenderPassKey::key -> VkRenderPass.
std::unordered_map<uint32_t, VkRenderPass> render_passes_;
// For host render targets.
struct FramebufferKey {
RenderPassKey render_pass_key;
@ -254,13 +385,276 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
void Reset() { std::memset(this, 0, sizeof(*this)); }
};
enum TransferUsedDescriptorSet : uint32_t {
// Ordered from the least to the most frequently changed.
kTransferUsedDescriptorSetHostDepthBuffer,
kTransferUsedDescriptorSetHostDepthStencilTextures,
kTransferUsedDescriptorSetDepthStencilTextures,
// Mutually exclusive with kTransferUsedDescriptorSetDepthStencilTextures.
kTransferUsedDescriptorSetColorTexture,
kTransferUsedDescriptorSetCount,
kTransferUsedDescriptorSetHostDepthBufferBit =
uint32_t(1) << kTransferUsedDescriptorSetHostDepthBuffer,
kTransferUsedDescriptorSetHostDepthStencilTexturesBit =
uint32_t(1) << kTransferUsedDescriptorSetHostDepthStencilTextures,
kTransferUsedDescriptorSetDepthStencilTexturesBit =
uint32_t(1) << kTransferUsedDescriptorSetDepthStencilTextures,
kTransferUsedDescriptorSetColorTextureBit =
uint32_t(1) << kTransferUsedDescriptorSetColorTexture,
};
// 32-bit push constants (for simplicity of size calculation and to avoid
// std140 packing issues).
enum TransferUsedPushConstantDword : uint32_t {
kTransferUsedPushConstantDwordHostDepthAddress,
kTransferUsedPushConstantDwordAddress,
// Changed 8 times per transfer.
kTransferUsedPushConstantDwordStencilMask,
kTransferUsedPushConstantDwordCount,
kTransferUsedPushConstantDwordHostDepthAddressBit =
uint32_t(1) << kTransferUsedPushConstantDwordHostDepthAddress,
kTransferUsedPushConstantDwordAddressBit =
uint32_t(1) << kTransferUsedPushConstantDwordAddress,
kTransferUsedPushConstantDwordStencilMaskBit =
uint32_t(1) << kTransferUsedPushConstantDwordStencilMask,
};
enum class TransferPipelineLayoutIndex {
kColor,
kDepth,
kColorToStencilBit,
kDepthToStencilBit,
kColorAndHostDepthTexture,
kColorAndHostDepthBuffer,
kDepthAndHostDepthTexture,
kDepthAndHostDepthBuffer,
kCount,
};
struct TransferPipelineLayoutInfo {
uint32_t used_descriptor_sets;
uint32_t used_push_constant_dwords;
};
static const TransferPipelineLayoutInfo
kTransferPipelineLayoutInfos[size_t(TransferPipelineLayoutIndex::kCount)];
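// As a purely hypothetical illustration of how these layout descriptions can
// be consumed (the variables below are not members of this class and the
// fragment stage is only an assumption): the push constant range of a
// transfer pipeline layout could be sized from the used dword mask, e.g.
//   const TransferPipelineLayoutInfo& layout_info =
//       kTransferPipelineLayoutInfos[size_t(TransferPipelineLayoutIndex::kColor)];
//   VkPushConstantRange push_constant_range;
//   push_constant_range.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
//   push_constant_range.offset = 0;
//   push_constant_range.size =
//       sizeof(uint32_t) * xe::bit_count(layout_info.used_push_constant_dwords);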
enum class TransferMode : uint32_t {
kColorToDepth,
kColorToColor,
kDepthToDepth,
kDepthToColor,
kColorToStencilBit,
kDepthToStencilBit,
// Two-source modes, using the host depth if it, when converted to the guest
// format, matches what's in the owner source (not modified, keep host
// precision), or the guest data otherwise (significantly modified, possibly
// cleared). Stencil for FragStencilRef is always taken from the guest
// source.
kColorAndHostDepthToDepth,
// When using different source and destination depth formats.
kDepthAndHostDepthToDepth,
// If host depth is fetched, but it's the same image as the destination,
// it's copied to the EDRAM buffer first (which is just a scratch buffer
// here, so tiles are laid out linearly with the same pitch as in the
// original render target; also, no swapping of 40-sample columns as
// opposed to the host render target - this is done only for the color
// source) and then fetched from there instead of the host depth texture.
kColorAndHostDepthCopyToDepth,
kDepthAndHostDepthCopyToDepth,
kCount,
};
enum class TransferOutput {
kColor,
kDepth,
kStencilBit,
};
struct TransferModeInfo {
TransferOutput output;
TransferPipelineLayoutIndex pipeline_layout;
};
static const TransferModeInfo kTransferModes[size_t(TransferMode::kCount)];
union TransferShaderKey {
uint32_t key;
struct {
xenos::MsaaSamples dest_msaa_samples : xenos::kMsaaSamplesBits;
uint32_t dest_color_rt_index : xenos::kColorRenderTargetIndexBits;
uint32_t dest_resource_format : xenos::kRenderTargetFormatBits;
xenos::MsaaSamples source_msaa_samples : xenos::kMsaaSamplesBits;
// Always 1x when the host depth is a copy from a buffer rather than an
// image, so that the same pipeline isn't created multiple times for
// different MSAA sample counts when they don't matter in this case.
xenos::MsaaSamples host_depth_source_msaa_samples
: xenos::kMsaaSamplesBits;
uint32_t source_resource_format : xenos::kRenderTargetFormatBits;
// Last bits because this affects the pipeline layout - after sorting, it
// changes as few times as possible. Depth buffers have an additional
// stencil texture.
static_assert(size_t(TransferMode::kCount) <= (size_t(1) << 4));
TransferMode mode : 4;
};
TransferShaderKey() : key(0) { static_assert_size(*this, sizeof(key)); }
struct Hasher {
size_t operator()(const TransferShaderKey& key) const {
return std::hash<uint32_t>{}(key.key);
}
};
bool operator==(const TransferShaderKey& other_key) const {
return key == other_key.key;
}
bool operator!=(const TransferShaderKey& other_key) const {
return !(*this == other_key);
}
bool operator<(const TransferShaderKey& other_key) const {
return key < other_key.key;
}
};
struct TransferPipelineKey {
RenderPassKey render_pass_key;
TransferShaderKey shader_key;
TransferPipelineKey(RenderPassKey render_pass_key,
TransferShaderKey shader_key)
: render_pass_key(render_pass_key), shader_key(shader_key) {}
struct Hasher {
size_t operator()(const TransferPipelineKey& key) const {
XXH3_state_t hash_state;
XXH3_64bits_reset(&hash_state);
XXH3_64bits_update(&hash_state, &key.render_pass_key,
sizeof(key.render_pass_key));
XXH3_64bits_update(&hash_state, &key.shader_key,
sizeof(key.shader_key));
return static_cast<size_t>(XXH3_64bits_digest(&hash_state));
}
};
bool operator==(const TransferPipelineKey& other_key) const {
return render_pass_key == other_key.render_pass_key &&
shader_key == other_key.shader_key;
}
bool operator!=(const TransferPipelineKey& other_key) const {
return !(*this == other_key);
}
bool operator<(const TransferPipelineKey& other_key) const {
if (render_pass_key != other_key.render_pass_key) {
return render_pass_key < other_key.render_pass_key;
}
return shader_key < other_key.shader_key;
}
};
union TransferAddressConstant {
uint32_t constant;
struct {
// All in tiles.
uint32_t dest_pitch : xenos::kEdramPitchTilesBits;
uint32_t source_pitch : xenos::kEdramPitchTilesBits;
// Safe to use 12 bits for the signed difference - no ownership transfer
// can ever occur between render targets with an EDRAM base >= 2048 as this
// would result in 0-length spans. 10 + 10 + 12 is exactly 32; any more
// bits, and another 32-bit push constant would be needed.
// Destination base in tiles minus source base in tiles (not vice versa
// because this is a transform of the coordinate system, not addresses
// themselves).
// 0 for host_depth_source_is_copy (ignored in that case as
// destination == source anyway).
int32_t source_to_dest : xenos::kEdramBaseTilesBits;
};
TransferAddressConstant() : constant(0) {
static_assert_size(*this, sizeof(constant));
}
bool operator==(const TransferAddressConstant& other_constant) const {
return constant == other_constant.constant;
}
bool operator!=(const TransferAddressConstant& other_constant) const {
return !(*this == other_constant);
}
};
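// Hypothetical example of the packing above (values chosen for illustration
// only): a transfer from a source render target based at tile 0 to a
// destination based at tile 320, both with a 16-tile pitch, could be encoded
// as
//   TransferAddressConstant address_constant;
//   address_constant.dest_pitch = 16;
//   address_constant.source_pitch = 16;
//   // Destination base minus source base, as a coordinate system transform.
//   address_constant.source_to_dest = 320;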
struct TransferInvocation {
Transfer transfer;
TransferShaderKey shader_key;
TransferInvocation(const Transfer& transfer,
const TransferShaderKey& shader_key)
: transfer(transfer), shader_key(shader_key) {}
bool operator<(const TransferInvocation& other_invocation) const {
// TODO(Triang3l): See if it may be better to sort by the source in the
// first place, especially when reading the same data multiple times (like
// to write the stencil bits after depth) for better read locality.
// Sort by the shader key primarily to reduce pipeline state (context)
// switches.
if (shader_key != other_invocation.shader_key) {
return shader_key < other_invocation.shader_key;
}
// Host depth render targets, if they exist at all, are changed rarely, so
// sorting by them wouldn't save many binding changes - ignore them for
// simplicity (their existence is caught by the shader key change).
assert_not_null(transfer.source);
assert_not_null(other_invocation.transfer.source);
uint32_t source_index =
static_cast<const VulkanRenderTarget*>(transfer.source)
->temporary_sort_index();
uint32_t other_source_index = static_cast<const VulkanRenderTarget*>(
other_invocation.transfer.source)
->temporary_sort_index();
if (source_index != other_source_index) {
return source_index < other_source_index;
}
return transfer.start_tiles < other_invocation.transfer.start_tiles;
}
bool CanBeMergedIntoOneDraw(
const TransferInvocation& other_invocation) const {
return shader_key == other_invocation.shader_key &&
transfer.AreSourcesSame(other_invocation.transfer);
}
};
// Returns the framebuffer object, or nullptr if failed to create.
const Framebuffer* GetFramebuffer(
RenderPassKey render_pass_key, uint32_t pitch_tiles_at_32bpp,
const RenderTarget* const* depth_and_color_render_targets);
VkShaderModule GetTransferShader(TransferShaderKey key);
// With sample-rate shading, returns a pointer to one pipeline. Without
// sample-rate shading, returns a pointer to as many pipelines as there are
// samples. If there was a failure to create a pipeline, returns nullptr.
VkPipeline const* GetTransferPipelines(TransferPipelineKey key);
// Performs ownership transfers for the render targets - each render target /
// transfer vector may be null / empty if there's nothing to do for it.
// resolve_clear_rectangle is expected to be provided by
// PrepareHostRenderTargetsResolveClear which should do all the needed size
// bound checks.
void PerformTransfersAndResolveClears(
uint32_t render_target_count, RenderTarget* const* render_targets,
const std::vector<Transfer>* render_target_transfers,
const uint64_t* render_target_resolve_clear_values = nullptr,
const Transfer::Rectangle* resolve_clear_rectangle = nullptr);
bool gamma_render_target_as_srgb_ = false;
bool msaa_2x_attachments_supported_ = false;
bool msaa_2x_no_attachments_supported_ = false;
std::unordered_map<FramebufferKey, Framebuffer, FramebufferKey::Hasher>
framebuffers_;
@ -271,6 +665,32 @@ class VulkanRenderTargetCache final : public RenderTargetCache {
last_update_framebuffer_attachments_[1 + xenos::kMaxColorRenderTargets] =
{};
const Framebuffer* last_update_framebuffer_ = nullptr;
// Set 0 - EDRAM storage buffer, set 1 - source depth sampled image (and
// unused stencil from the transfer descriptor set), HostDepthStoreConstants
// passed via push constants.
VkPipelineLayout host_depth_store_pipeline_layout_ = VK_NULL_HANDLE;
VkPipeline host_depth_store_pipelines_[size_t(xenos::MsaaSamples::k4X) + 1] =
{};
std::unique_ptr<ui::vulkan::VulkanUploadBufferPool>
transfer_vertex_buffer_pool_;
VkShaderModule transfer_passthrough_vertex_shader_ = VK_NULL_HANDLE;
VkPipelineLayout transfer_pipeline_layouts_[size_t(
TransferPipelineLayoutIndex::kCount)] = {};
// VK_NULL_HANDLE if failed to create.
std::unordered_map<TransferShaderKey, VkShaderModule,
TransferShaderKey::Hasher>
transfer_shaders_;
// With sample-rate shading, one pipeline per entry. Without sample-rate
// shading, one pipeline per sample per entry. VK_NULL_HANDLE if failed to
// create.
std::unordered_map<TransferPipelineKey, std::array<VkPipeline, 4>,
TransferPipelineKey::Hasher>
transfer_pipelines_;
// Temporary storage for PerformTransfersAndResolveClears.
std::vector<TransferInvocation> current_transfer_invocations_;
};
} // namespace vulkan
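For context, the transfer batching types above are intended to be used roughly like the following sketch. This is an illustration only, not the actual body of PerformTransfersAndResolveClears; the `transfers` vector and the `MakeTransferShaderKey` helper are assumed for the example, and <algorithm> / <vector> are assumed to be included.

// Hypothetical sketch: batch transfers by pipeline state and sources.
std::vector<TransferInvocation> invocations;
invocations.reserve(transfers.size());
for (const Transfer& transfer : transfers) {
  // MakeTransferShaderKey is an assumed helper deriving a TransferShaderKey
  // from the source and destination render targets of the transfer.
  invocations.emplace_back(transfer, MakeTransferShaderKey(transfer));
}
// Sort primarily by the shader key to minimize pipeline state switches, then
// by the source render target, then by the start tile.
std::sort(invocations.begin(), invocations.end());
size_t i = 0;
while (i < invocations.size()) {
  size_t merged_end = i + 1;
  // Merge consecutive invocations using the same pipeline and sources into a
  // single draw.
  while (merged_end < invocations.size() &&
         invocations[i].CanBeMergedIntoOneDraw(invocations[merged_end])) {
    ++merged_end;
  }
  // One draw covering invocations [i, merged_end) would be recorded here.
  i = merged_end;
}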

View File

@ -177,6 +177,10 @@ bool VulkanSharedMemory::Initialize() {
}
}
// The first usage will likely be uploading.
last_usage_ = Usage::kTransferDestination;
last_written_range_ = std::make_pair<uint32_t, uint32_t>(0, 0);
upload_buffer_pool_ = std::make_unique<ui::vulkan::VulkanUploadBufferPool>(
provider, VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
xe::align(ui::vulkan::VulkanUploadBufferPool::kDefaultPageSize,
@ -190,9 +194,6 @@ void VulkanSharedMemory::Shutdown(bool from_destructor) {
upload_buffer_pool_.reset();
last_written_range_ = std::make_pair<uint32_t, uint32_t>(0, 0);
last_usage_ = Usage::kTransferDestination;
const ui::vulkan::VulkanProvider& provider =
command_processor_.GetVulkanProvider();
const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
@ -226,8 +227,8 @@ void VulkanSharedMemory::Use(Usage usage,
if (last_usage_ != usage || last_written_range_.second) {
VkPipelineStageFlags src_stage_mask, dst_stage_mask;
VkAccessFlags src_access_mask, dst_access_mask;
GetBarrier(last_usage_, src_stage_mask, src_access_mask);
GetBarrier(usage, dst_stage_mask, dst_access_mask);
GetUsageMasks(last_usage_, src_stage_mask, src_access_mask);
GetUsageMasks(usage, dst_stage_mask, dst_access_mask);
VkDeviceSize offset, size;
if (last_usage_ == usage) {
// Committing the previous write, while not changing the access mask
@ -447,9 +448,9 @@ bool VulkanSharedMemory::UploadRanges(
return successful;
}
void VulkanSharedMemory::GetBarrier(Usage usage,
VkPipelineStageFlags& stage_mask,
VkAccessFlags& access_mask) const {
void VulkanSharedMemory::GetUsageMasks(Usage usage,
VkPipelineStageFlags& stage_mask,
VkAccessFlags& access_mask) const {
switch (usage) {
case Usage::kComputeWrite:
stage_mask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

View File

@ -47,8 +47,8 @@ class VulkanSharedMemory : public SharedMemory {
kComputeWrite,
kTransferDestination,
};
// Places pipeline barrier for the target usage, also ensuring writes of
// adjacent are ordered with writes of each other and reads.
// Inserts a pipeline barrier for the target usage, also ensuring consecutive
// read-write accesses are ordered with each other.
void Use(Usage usage, std::pair<uint32_t, uint32_t> written_range = {});
VkBuffer buffer() const { return buffer_; }
@ -65,8 +65,8 @@ class VulkanSharedMemory : public SharedMemory {
upload_page_ranges) override;
private:
void GetBarrier(Usage usage, VkPipelineStageFlags& stage_mask,
VkAccessFlags& access_mask) const;
void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,
VkAccessFlags& access_mask) const;
VulkanCommandProcessor& command_processor_;
TraceWriter& trace_writer_;
@ -76,9 +76,8 @@ class VulkanSharedMemory : public SharedMemory {
// Single for non-sparse, every allocation so far for sparse.
std::vector<VkDeviceMemory> buffer_memory_;
// First usage will likely be uploading.
Usage last_usage_ = Usage::kTransferDestination;
std::pair<uint32_t, uint32_t> last_written_range_ = {};
Usage last_usage_;
std::pair<uint32_t, uint32_t> last_written_range_;
std::unique_ptr<ui::vulkan::VulkanUploadBufferPool> upload_buffer_pool_;
std::vector<VkBufferCopy> upload_regions_;

View File

@ -248,6 +248,7 @@ enum class MsaaSamples : uint32_t {
constexpr uint32_t kMsaaSamplesBits = 2;
constexpr uint32_t kColorRenderTargetIndexBits = 2;
constexpr uint32_t kMaxColorRenderTargets = 4;
enum class ColorRenderTargetFormat : uint32_t {

View File

@ -15,6 +15,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdClearColorImage)
XE_UI_VULKAN_FUNCTION(vkCmdCopyBuffer)
XE_UI_VULKAN_FUNCTION(vkCmdCopyBufferToImage)
XE_UI_VULKAN_FUNCTION(vkCmdCopyImageToBuffer)
XE_UI_VULKAN_FUNCTION(vkCmdDispatch)
XE_UI_VULKAN_FUNCTION(vkCmdDraw)
XE_UI_VULKAN_FUNCTION(vkCmdDrawIndexed)
XE_UI_VULKAN_FUNCTION(vkCmdEndRenderPass)
@ -29,6 +30,7 @@ XE_UI_VULKAN_FUNCTION(vkCmdSetStencilWriteMask)
XE_UI_VULKAN_FUNCTION(vkCmdSetViewport)
XE_UI_VULKAN_FUNCTION(vkCreateBuffer)
XE_UI_VULKAN_FUNCTION(vkCreateCommandPool)
XE_UI_VULKAN_FUNCTION(vkCreateComputePipelines)
XE_UI_VULKAN_FUNCTION(vkCreateDescriptorPool)
XE_UI_VULKAN_FUNCTION(vkCreateDescriptorSetLayout)
XE_UI_VULKAN_FUNCTION(vkCreateFence)

View File

@ -0,0 +1,120 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/ui/vulkan/single_layout_descriptor_set_pool.h"
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
namespace xe {
namespace ui {
namespace vulkan {
SingleLayoutDescriptorSetPool::SingleLayoutDescriptorSetPool(
const VulkanProvider& provider, uint32_t pool_set_count,
uint32_t set_layout_descriptor_counts_count,
const VkDescriptorPoolSize* set_layout_descriptor_counts,
VkDescriptorSetLayout set_layout)
: provider_(provider),
pool_set_count_(pool_set_count),
set_layout_(set_layout) {
assert_not_zero(pool_set_count);
pool_descriptor_counts_.resize(set_layout_descriptor_counts_count);
for (uint32_t i = 0; i < set_layout_descriptor_counts_count; ++i) {
VkDescriptorPoolSize& pool_descriptor_type_count =
pool_descriptor_counts_[i];
const VkDescriptorPoolSize& set_layout_descriptor_type_count =
set_layout_descriptor_counts[i];
pool_descriptor_type_count.type = set_layout_descriptor_type_count.type;
pool_descriptor_type_count.descriptorCount =
set_layout_descriptor_type_count.descriptorCount * pool_set_count;
}
}
SingleLayoutDescriptorSetPool::~SingleLayoutDescriptorSetPool() {
const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn();
VkDevice device = provider_.device();
if (current_pool_ != VK_NULL_HANDLE) {
dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr);
}
for (VkDescriptorPool pool : full_pools_) {
dfn.vkDestroyDescriptorPool(device, pool, nullptr);
}
}
size_t SingleLayoutDescriptorSetPool::Allocate() {
if (!descriptor_sets_free_.empty()) {
size_t free_index = descriptor_sets_free_.back();
descriptor_sets_free_.pop_back();
return free_index;
}
const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider_.dfn();
VkDevice device = provider_.device();
// Two iterations so if vkAllocateDescriptorSets fails even with a non-zero
// current_pool_sets_remaining_, another attempt will be made in a new pool.
for (uint32_t i = 0; i < 2; ++i) {
if (current_pool_ != VK_NULL_HANDLE && !current_pool_sets_remaining_) {
full_pools_.push_back(current_pool_);
current_pool_ = VK_NULL_HANDLE;
}
if (current_pool_ == VK_NULL_HANDLE) {
VkDescriptorPoolCreateInfo pool_create_info;
pool_create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
pool_create_info.pNext = nullptr;
pool_create_info.flags = 0;
pool_create_info.maxSets = pool_set_count_;
pool_create_info.poolSizeCount = uint32_t(pool_descriptor_counts_.size());
pool_create_info.pPoolSizes = pool_descriptor_counts_.data();
if (dfn.vkCreateDescriptorPool(device, &pool_create_info, nullptr,
&current_pool_) != VK_SUCCESS) {
XELOGE(
"SingleLayoutDescriptorSetPool: Failed to create a descriptor "
"pool");
return SIZE_MAX;
}
current_pool_sets_remaining_ = pool_set_count_;
}
VkDescriptorSetAllocateInfo descriptor_set_allocate_info;
descriptor_set_allocate_info.sType =
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
descriptor_set_allocate_info.pNext = nullptr;
descriptor_set_allocate_info.descriptorPool = current_pool_;
descriptor_set_allocate_info.descriptorSetCount = 1;
descriptor_set_allocate_info.pSetLayouts = &set_layout_;
VkDescriptorSet descriptor_set;
if (dfn.vkAllocateDescriptorSets(device, &descriptor_set_allocate_info,
&descriptor_set) != VK_SUCCESS) {
XELOGE(
"SingleLayoutDescriptorSetPool: Failed to allocate a descriptor "
"set");
if (current_pool_sets_remaining_ >= pool_set_count_) {
// Failed to allocate in a new pool - something completely wrong, don't
// store empty pools as full.
dfn.vkDestroyDescriptorPool(device, current_pool_, nullptr);
current_pool_ = VK_NULL_HANDLE;
return SIZE_MAX;
}
full_pools_.push_back(current_pool_);
current_pool_ = VK_NULL_HANDLE;
// Retry in a new pool on the second iteration.
continue;
}
--current_pool_sets_remaining_;
descriptor_sets_.push_back(descriptor_set);
return descriptor_sets_.size() - 1;
}
// Both attempts have failed.
return SIZE_MAX;
}
} // namespace vulkan
} // namespace ui
} // namespace xe

View File

@ -0,0 +1,63 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_UI_VULKAN_SINGLE_LAYOUT_DESCRIPTOR_SET_POOL_H_
#define XENIA_UI_VULKAN_SINGLE_LAYOUT_DESCRIPTOR_SET_POOL_H_
#include <cstddef>
#include <cstdint>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/ui/vulkan/vulkan_provider.h"
namespace xe {
namespace ui {
namespace vulkan {
class SingleLayoutDescriptorSetPool {
public:
// set_layout_descriptor_counts must contain the numbers of descriptors of
// each type in a single set with the layout (the multiplication by the pool
// set count will be done internally). The descriptor set layout must not be
// destroyed until this object is also destroyed.
SingleLayoutDescriptorSetPool(
const VulkanProvider& provider, uint32_t pool_set_count,
uint32_t set_layout_descriptor_counts_count,
const VkDescriptorPoolSize* set_layout_descriptor_counts,
VkDescriptorSetLayout set_layout);
~SingleLayoutDescriptorSetPool();
// Returns SIZE_MAX in case of a failure.
size_t Allocate();
void Free(size_t index) {
assert_true(index < descriptor_sets_.size());
descriptor_sets_free_.push_back(index);
}
VkDescriptorSet Get(size_t index) const { return descriptor_sets_[index]; }
private:
const VulkanProvider& provider_;
uint32_t pool_set_count_;
std::vector<VkDescriptorPoolSize> pool_descriptor_counts_;
VkDescriptorSetLayout set_layout_;
std::vector<VkDescriptorPool> full_pools_;
VkDescriptorPool current_pool_ = VK_NULL_HANDLE;
uint32_t current_pool_sets_remaining_ = 0;
std::vector<VkDescriptorSet> descriptor_sets_;
std::vector<size_t> descriptor_sets_free_;
};
} // namespace vulkan
} // namespace ui
} // namespace xe
#endif  // XENIA_UI_VULKAN_SINGLE_LAYOUT_DESCRIPTOR_SET_POOL_H_
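Typical intended usage of the pool, shown as a hedged sketch - the set count, descriptor type, `provider` and `set_layout` below are placeholders for this example, not the values the render target cache actually passes:

// One sampled image per set, 64 sets per underlying VkDescriptorPool.
VkDescriptorPoolSize set_descriptor_count;
set_descriptor_count.type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
set_descriptor_count.descriptorCount = 1;
xe::ui::vulkan::SingleLayoutDescriptorSetPool pool(
    provider, 64, 1, &set_descriptor_count, set_layout);
size_t set_index = pool.Allocate();
if (set_index != SIZE_MAX) {
  VkDescriptorSet set = pool.Get(set_index);
  // vkUpdateDescriptorSets and binding of `set` would happen here.
  // Free only returns the index to the internal free list for reuse - the
  // VkDescriptorSet itself stays allocated in its pool.
  pool.Free(set_index);
}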

View File

@ -715,6 +715,8 @@ bool VulkanProvider::Initialize() {
static const std::pair<const char*, size_t> kUsedDeviceExtensions[] = {
{"VK_EXT_fragment_shader_interlock",
offsetof(DeviceExtensions, ext_fragment_shader_interlock)},
{"VK_EXT_shader_stencil_export",
offsetof(DeviceExtensions, ext_shader_stencil_export)},
{"VK_KHR_dedicated_allocation",
offsetof(DeviceExtensions, khr_dedicated_allocation)},
{"VK_KHR_image_format_list",
@ -946,6 +948,8 @@ bool VulkanProvider::Initialize() {
XELOGVK("Vulkan device extensions:");
XELOGVK("* VK_EXT_fragment_shader_interlock: {}",
device_extensions_.ext_fragment_shader_interlock ? "yes" : "no");
XELOGVK("* VK_EXT_shader_stencil_export: {}",
device_extensions_.ext_shader_stencil_export ? "yes" : "no");
XELOGVK("* VK_KHR_dedicated_allocation: {}",
device_extensions_.khr_dedicated_allocation ? "yes" : "no");
XELOGVK("* VK_KHR_image_format_list: {}",

View File

@ -132,6 +132,7 @@ class VulkanProvider : public GraphicsProvider {
}
struct DeviceExtensions {
bool ext_fragment_shader_interlock;
bool ext_shader_stencil_export;
// Core since 1.1.0.
bool khr_dedicated_allocation;
// Core since 1.2.0.

View File

@ -189,6 +189,53 @@ bool CreateDedicatedAllocationImage(const VulkanProvider& provider,
return true;
}
VkPipeline CreateComputePipeline(
const VulkanProvider& provider, VkPipelineLayout layout,
VkShaderModule shader, const VkSpecializationInfo* specialization_info,
const char* entry_point) {
const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
VkDevice device = provider.device();
VkComputePipelineCreateInfo pipeline_create_info;
pipeline_create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
pipeline_create_info.pNext = nullptr;
pipeline_create_info.flags = 0;
pipeline_create_info.stage.sType =
VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
pipeline_create_info.stage.pNext = nullptr;
pipeline_create_info.stage.flags = 0;
pipeline_create_info.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
pipeline_create_info.stage.module = shader;
pipeline_create_info.stage.pName = entry_point;
pipeline_create_info.stage.pSpecializationInfo = specialization_info;
pipeline_create_info.layout = layout;
pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE;
pipeline_create_info.basePipelineIndex = -1;
VkPipeline pipeline;
if (dfn.vkCreateComputePipelines(device, VK_NULL_HANDLE, 1,
&pipeline_create_info, nullptr,
&pipeline) != VK_SUCCESS) {
return VK_NULL_HANDLE;
}
return pipeline;
}
VkPipeline CreateComputePipeline(
const VulkanProvider& provider, VkPipelineLayout layout,
const uint32_t* shader_code, size_t shader_code_size_bytes,
const VkSpecializationInfo* specialization_info, const char* entry_point) {
VkShaderModule shader =
CreateShaderModule(provider, shader_code, shader_code_size_bytes);
if (shader == VK_NULL_HANDLE) {
return VK_NULL_HANDLE;
}
const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
VkDevice device = provider.device();
VkPipeline pipeline = CreateComputePipeline(provider, layout, shader,
specialization_info, entry_point);
dfn.vkDestroyShaderModule(device, shader, nullptr);
return pipeline;
}
} // namespace util
} // namespace vulkan
} // namespace ui

View File

@ -164,6 +164,17 @@ inline VkShaderModule CreateShaderModule(const VulkanProvider& provider,
: VK_NULL_HANDLE;
}
VkPipeline CreateComputePipeline(
const VulkanProvider& provider, VkPipelineLayout layout,
VkShaderModule shader,
const VkSpecializationInfo* specialization_info = nullptr,
const char* entry_point = "main");
VkPipeline CreateComputePipeline(
const VulkanProvider& provider, VkPipelineLayout layout,
const uint32_t* shader_code, size_t shader_code_size_bytes,
const VkSpecializationInfo* specialization_info = nullptr,
const char* entry_point = "main");
} // namespace util
} // namespace vulkan
} // namespace ui
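The new compute pipeline helper is intended to be called like the following sketch - `provider`, the pipeline layout and the embedded SPIR-V array are assumptions made for this example, not names introduced by this change:

// Creating a compute pipeline straight from an embedded SPIR-V blob
// (example_dump_cs is assumed to be a static const uint32_t array).
VkPipeline dump_pipeline = ui::vulkan::util::CreateComputePipeline(
    provider, dump_pipeline_layout, example_dump_cs, sizeof(example_dump_cs));
if (dump_pipeline == VK_NULL_HANDLE) {
  XELOGE("Failed to create the example dump compute pipeline");
}
// The overload taking a VkShaderModule can be used instead when one module is
// reused for several pipelines, for instance with different specialization
// constants.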