From 9fa41c27bc185a6b3a75196a811f6c3709506858 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Tue, 26 Jul 2022 16:01:20 +0300
Subject: [PATCH 1/5] [Vulkan] Point sprite geometry shader

---
 .../gpu/d3d12/d3d12_command_processor.cc      |  80 ++---
 src/xenia/gpu/spirv_shader_translator.cc      | 206 +++++++++--
 src/xenia/gpu/spirv_shader_translator.h       |  23 +-
 .../gpu/vulkan/vulkan_command_processor.cc    |  43 +++
 src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc | 331 +++++++++++++++++-
 src/xenia/gpu/vulkan/vulkan_pipeline_cache.h  |   2 +
 6 files changed, 591 insertions(+), 94 deletions(-)

diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index b92a61d77..b6f72ff9b 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -3160,8 +3160,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
   const RegisterFile& regs = *register_file_;
   auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
   auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
-  auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
-  auto pa_su_point_size = regs.Get<reg::PA_SU_POINT_SIZE>();
   auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
   float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32;
   auto rb_colorcontrol = regs.Get<reg::RB_COLORCONTROL>();
@@ -3365,43 +3363,47 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
   }
 
   // Point size.
-  float point_vertex_diameter_min =
-      float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
-  float point_vertex_diameter_max =
-      float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
-  float point_constant_diameter_x =
-      float(pa_su_point_size.width) * (2.0f / 16.0f);
-  float point_constant_diameter_y =
-      float(pa_su_point_size.height) * (2.0f / 16.0f);
-  dirty |=
-      system_constants_.point_vertex_diameter_min != point_vertex_diameter_min;
-  dirty |=
-      system_constants_.point_vertex_diameter_max != point_vertex_diameter_max;
-  dirty |=
-      system_constants_.point_constant_diameter[0] != point_constant_diameter_x;
-  dirty |=
-      system_constants_.point_constant_diameter[1] != point_constant_diameter_y;
-  system_constants_.point_vertex_diameter_min = point_vertex_diameter_min;
-  system_constants_.point_vertex_diameter_max = point_vertex_diameter_max;
-  system_constants_.point_constant_diameter[0] = point_constant_diameter_x;
-  system_constants_.point_constant_diameter[1] = point_constant_diameter_y;
-  // 2 because 1 in the NDC is half of the viewport's axis, 0.5 for diameter to
-  // radius conversion to avoid multiplying the per-vertex diameter by an
-  // additional constant in the shader.
-  float point_screen_diameter_to_ndc_radius_x =
-      (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) /
-      std::max(viewport_info.xy_extent[0], uint32_t(1));
-  float point_screen_diameter_to_ndc_radius_y =
-      (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) /
-      std::max(viewport_info.xy_extent[1], uint32_t(1));
-  dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] !=
-           point_screen_diameter_to_ndc_radius_x;
-  dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] !=
-           point_screen_diameter_to_ndc_radius_y;
-  system_constants_.point_screen_diameter_to_ndc_radius[0] =
-      point_screen_diameter_to_ndc_radius_x;
-  system_constants_.point_screen_diameter_to_ndc_radius[1] =
-      point_screen_diameter_to_ndc_radius_y;
+  if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
+    auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
+    auto pa_su_point_size = regs.Get<reg::PA_SU_POINT_SIZE>();
+    float point_vertex_diameter_min =
+        float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
+    float point_vertex_diameter_max =
+        float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
+    float point_constant_diameter_x =
+        float(pa_su_point_size.width) * (2.0f / 16.0f);
+    float point_constant_diameter_y =
+        float(pa_su_point_size.height) * (2.0f / 16.0f);
+    dirty |= system_constants_.point_vertex_diameter_min !=
+             point_vertex_diameter_min;
+    dirty |= system_constants_.point_vertex_diameter_max !=
+             point_vertex_diameter_max;
+    dirty |= system_constants_.point_constant_diameter[0] !=
+             point_constant_diameter_x;
+    dirty |= system_constants_.point_constant_diameter[1] !=
+             point_constant_diameter_y;
+    system_constants_.point_vertex_diameter_min = point_vertex_diameter_min;
+    system_constants_.point_vertex_diameter_max = point_vertex_diameter_max;
+    system_constants_.point_constant_diameter[0] = point_constant_diameter_x;
+    system_constants_.point_constant_diameter[1] = point_constant_diameter_y;
+    // 2 because 1 in the NDC is half of the viewport's axis, 0.5 for diameter
+    // to radius conversion to avoid multiplying the per-vertex diameter by an
+    // additional constant in the shader.
+    float point_screen_diameter_to_ndc_radius_x =
+        (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) /
+        std::max(viewport_info.xy_extent[0], uint32_t(1));
+    float point_screen_diameter_to_ndc_radius_y =
+        (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) /
+        std::max(viewport_info.xy_extent[1], uint32_t(1));
+    dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] !=
+             point_screen_diameter_to_ndc_radius_x;
+    dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] !=
+             point_screen_diameter_to_ndc_radius_y;
+    system_constants_.point_screen_diameter_to_ndc_radius[0] =
+        point_screen_diameter_to_ndc_radius_x;
+    system_constants_.point_screen_diameter_to_ndc_radius[1] =
+        point_screen_diameter_to_ndc_radius_y;
+  }
 
   // Texture signedness / gamma.
   bool gamma_render_target_as_srgb =
diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc
index f7dc2c1f3..452ac1450 100644
--- a/src/xenia/gpu/spirv_shader_translator.cc
+++ b/src/xenia/gpu/spirv_shader_translator.cc
@@ -106,16 +106,19 @@ void SpirvShaderTranslator::Reset() {
 
   uniform_float_constants_ = spv::NoResult;
 
-  input_fragment_coord_ = spv::NoResult;
+  input_point_coordinates_ = spv::NoResult;
+  input_fragment_coordinates_ = spv::NoResult;
   input_front_facing_ = spv::NoResult;
   std::fill(input_output_interpolators_.begin(),
             input_output_interpolators_.end(), spv::NoResult);
+  output_point_size_ = spv::NoResult;
 
   sampler_bindings_.clear();
   texture_bindings_.clear();
 
   main_interface_.clear();
   var_main_registers_ = spv::NoResult;
+  var_main_point_size_edge_flag_kill_vertex_ = spv::NoResult;
 
   main_switch_op_.reset();
   main_switch_next_pc_phi_operands_.clear();
@@ -230,7 +233,16 @@ void SpirvShaderTranslator::StartTranslation() {
       {"vertex_base_index", offsetof(SystemConstants, vertex_base_index),
        type_int_},
       {"ndc_scale", offsetof(SystemConstants, ndc_scale), type_float3_},
+      {"point_vertex_diameter_min",
+       offsetof(SystemConstants, point_vertex_diameter_min), type_float_},
       {"ndc_offset", offsetof(SystemConstants, ndc_offset), type_float3_},
+      {"point_vertex_diameter_max",
+       offsetof(SystemConstants, point_vertex_diameter_max), type_float_},
+      {"point_constant_diameter",
+       offsetof(SystemConstants, point_constant_diameter), type_float2_},
+      {"point_screen_diameter_to_ndc_radius",
+       offsetof(SystemConstants, point_screen_diameter_to_ndc_radius),
+       type_float2_},
       {"texture_swizzled_signs",
        offsetof(SystemConstants, texture_swizzled_signs), type_uint4_array_2},
       {"texture_swizzles", offsetof(SystemConstants, texture_swizzles),
@@ -1063,9 +1075,10 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() {
     main_interface_.push_back(input_vertex_index_);
   }
 
+  uint32_t output_location = 0;
+
   // Create the interpolator outputs.
   {
-    uint32_t interpolator_location = 0;
     uint32_t interpolators_remaining = GetModificationInterpolatorMask();
     uint32_t interpolator_index;
     while (xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) {
@@ -1075,13 +1088,29 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() {
           fmt::format("xe_out_interpolator_{}", interpolator_index).c_str());
       input_output_interpolators_[interpolator_index] = interpolator;
       builder_->addDecoration(interpolator, spv::DecorationLocation,
-                              int(interpolator_location));
+                              int(output_location));
       builder_->addDecoration(interpolator, spv::DecorationInvariant);
       main_interface_.push_back(interpolator);
-      ++interpolator_location;
+      ++output_location;
     }
   }
 
+  Modification shader_modification = GetSpirvShaderModification();
+
+  // Create the point size output. gl_PointSize from gl_PerVertex is avoided so
+  // as not to rely on the shaderTessellationAndGeometryPointSize feature, and
+  // because the value written to gl_PointSize must be greater than zero.
+  if (shader_modification.vertex.output_point_size) {
+    output_point_size_ =
+        builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput,
+                                 type_float_, "xe_out_point_size");
+    builder_->addDecoration(output_point_size_, spv::DecorationLocation,
+                            int(output_location));
+    builder_->addDecoration(output_point_size_, spv::DecorationInvariant);
+    main_interface_.push_back(output_point_size_);
+    ++output_location;
+  }
+
   // Create the gl_PerVertex output for used system outputs.
   std::vector<spv::Id> struct_per_vertex_members;
   struct_per_vertex_members.reserve(kOutputPerVertexMemberCount);
@@ -1103,9 +1132,23 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() {
 }
 
 void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
-  var_main_point_size_edge_flag_kill_vertex_ = builder_->createVariable(
-      spv::NoPrecision, spv::StorageClassFunction, type_float3_,
-      "xe_var_point_size_edge_flag_kill_vertex");
+  // The edge flag isn't used for any purpose by the translator.
+  if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b101) {
+    id_vector_temp_.clear();
+    id_vector_temp_.reserve(3);
+    // Set the point size to a negative value to tell the point sprite expansion
+    // that it should use the default point size if the vertex shader does not
+    // override it.
+    id_vector_temp_.push_back(builder_->makeFloatConstant(-1.0f));
+    // The edge flag is ignored.
+    id_vector_temp_.push_back(const_float_0_);
+    // Don't kill by default (zero bits 0:30).
+    id_vector_temp_.push_back(const_float_0_);
+    var_main_point_size_edge_flag_kill_vertex_ = builder_->createVariable(
+        spv::NoPrecision, spv::StorageClassFunction, type_float3_,
+        "xe_var_point_size_edge_flag_kill_vertex",
+        builder_->makeCompositeConstant(type_float3_, id_vector_temp_));
+  }
 
   // Zero general-purpose registers to prevent crashes when the game
   // references them after only initializing them conditionally.
@@ -1352,13 +1395,35 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
         std::move(composite_construct_op));
   }
   builder_->createStore(position, position_ptr);
+
+  // Write the point size.
+  if (output_point_size_ != spv::NoResult) {
+    spv::Id point_size;
+    if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b001) {
+      assert_true(var_main_point_size_edge_flag_kill_vertex_ != spv::NoResult);
+      id_vector_temp_.clear();
+      // X vector component.
+      id_vector_temp_.push_back(const_int_0_);
+      point_size = builder_->createLoad(
+          builder_->createAccessChain(
+              spv::StorageClassFunction,
+              var_main_point_size_edge_flag_kill_vertex_, id_vector_temp_),
+          spv::NoPrecision);
+    } else {
+      // Not statically overridden - write a negative value.
+      point_size = builder_->makeFloatConstant(-1.0f);
+    }
+    builder_->createStore(point_size, output_point_size_);
+  }
 }
 
 void SpirvShaderTranslator::StartFragmentShaderBeforeMain() {
-  // Interpolator inputs.
   Modification shader_modification = GetSpirvShaderModification();
+
+  uint32_t input_location = 0;
+
+  // Interpolator inputs.
   {
-    uint32_t interpolator_location = 0;
     uint32_t interpolators_remaining = GetModificationInterpolatorMask();
     uint32_t interpolator_index;
     while (xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) {
@@ -1368,28 +1433,41 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() {
           fmt::format("xe_in_interpolator_{}", interpolator_index).c_str());
       input_output_interpolators_[interpolator_index] = interpolator;
       builder_->addDecoration(interpolator, spv::DecorationLocation,
-                              int(interpolator_location));
+                              int(input_location));
       if (shader_modification.pixel.interpolators_centroid &
           (UINT32_C(1) << interpolator_index)) {
         builder_->addDecoration(interpolator, spv::DecorationCentroid);
       }
       main_interface_.push_back(interpolator);
-      ++interpolator_location;
+      ++input_location;
     }
   }
 
   bool param_gen_needed = GetPsParamGenInterpolator() != UINT32_MAX;
 
+  // Point coordinate input.
+  if (shader_modification.pixel.param_gen_point) {
+    if (param_gen_needed) {
+      input_point_coordinates_ =
+          builder_->createVariable(spv::NoPrecision, spv::StorageClassInput,
+                                   type_float2_, "xe_in_point_coordinates");
+      builder_->addDecoration(input_point_coordinates_, spv::DecorationLocation,
+                              int(input_location));
+      main_interface_.push_back(input_point_coordinates_);
+    }
+    ++input_location;
+  }
+
   // Fragment coordinates.
   // TODO(Triang3l): More conditions - fragment shader interlock render backend,
   // alpha to coverage (if RT 0 is written, and there's no early depth /
   // stencil), depth writing in the fragment shader (per-sample if supported).
   if (param_gen_needed) {
-    input_fragment_coord_ = builder_->createVariable(
+    input_fragment_coordinates_ = builder_->createVariable(
         spv::NoPrecision, spv::StorageClassInput, type_float4_, "gl_FragCoord");
-    builder_->addDecoration(input_fragment_coord_, spv::DecorationBuiltIn,
+    builder_->addDecoration(input_fragment_coordinates_, spv::DecorationBuiltIn,
                             spv::BuiltInFragCoord);
-    main_interface_.push_back(input_fragment_coord_);
+    main_interface_.push_back(input_fragment_coordinates_);
   }
 
   // Is front facing.
@@ -1473,13 +1551,14 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() {
     spv::Id const_sign_bit = builder_->makeUintConstant(UINT32_C(1) << 31);
     // TODO(Triang3l): Resolution scale inversion.
     // X - pixel X .0 in the magnitude, is back-facing in the sign bit.
-    assert_true(input_fragment_coord_ != spv::NoResult);
+    assert_true(input_fragment_coordinates_ != spv::NoResult);
     id_vector_temp_.clear();
     id_vector_temp_.push_back(const_int_0_);
-    spv::Id param_gen_x = builder_->createLoad(
-        builder_->createAccessChain(spv::StorageClassInput,
-                                    input_fragment_coord_, id_vector_temp_),
-        spv::NoPrecision);
+    spv::Id param_gen_x =
+        builder_->createLoad(builder_->createAccessChain(
+                                 spv::StorageClassInput,
+                                 input_fragment_coordinates_, id_vector_temp_),
+                             spv::NoPrecision);
     id_vector_temp_.clear();
     id_vector_temp_.push_back(param_gen_x);
     param_gen_x = builder_->createBuiltinCall(
@@ -1514,10 +1593,11 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() {
     // Y - pixel Y .0 in the magnitude, is point in the sign bit.
     id_vector_temp_.clear();
     id_vector_temp_.push_back(builder_->makeIntConstant(1));
-    spv::Id param_gen_y = builder_->createLoad(
-        builder_->createAccessChain(spv::StorageClassInput,
-                                    input_fragment_coord_, id_vector_temp_),
-        spv::NoPrecision);
+    spv::Id param_gen_y =
+        builder_->createLoad(builder_->createAccessChain(
+                                 spv::StorageClassInput,
+                                 input_fragment_coordinates_, id_vector_temp_),
+                             spv::NoPrecision);
     id_vector_temp_.clear();
     id_vector_temp_.push_back(param_gen_y);
     param_gen_y = builder_->createBuiltinCall(
@@ -1535,10 +1615,16 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() {
               const_sign_bit));
     }
     // Z - point S in the magnitude, is line in the sign bit.
-    spv::Id param_gen_z;
+    // W - point T in the magnitude.
+    spv::Id param_gen_z, param_gen_w;
     if (modification.pixel.param_gen_point) {
-      // TODO(Triang3l): Point coordinates.
-      param_gen_z = const_float_0_;
+      assert_true(input_point_coordinates_ != spv::NoResult);
+      spv::Id param_gen_point_coordinates =
+          builder_->createLoad(input_point_coordinates_, spv::NoPrecision);
+      param_gen_z = builder_->createCompositeExtract(
+          param_gen_point_coordinates, type_float_, 0);
+      param_gen_w = builder_->createCompositeExtract(
+          param_gen_point_coordinates, type_float_, 1);
     } else {
       param_gen_z = builder_->createUnaryOp(
           spv::OpBitcast, type_float_,
@@ -1552,10 +1638,8 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() {
                       builder_->makeUintConstant(kSysFlag_PrimitiveLine)),
                   const_uint_0_),
               const_sign_bit, const_uint_0_));
+      param_gen_w = const_float_0_;
     }
-    // W - point T in the magnitude.
-    // TODO(Triang3l): Point coordinates.
-    spv::Id param_gen_w = const_float_0_;
     // Store the pixel parameters.
     id_vector_temp_.clear();
     id_vector_temp_.reserve(4);
@@ -1927,15 +2011,20 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result,
       target_pointer = input_output_interpolators_[result.storage_index];
       // Unused interpolators are spv::NoResult in input_output_interpolators_.
     } break;
-    case InstructionStorageTarget::kPosition:
+    case InstructionStorageTarget::kPosition: {
       assert_true(is_vertex_shader());
       id_vector_temp_util_.clear();
       id_vector_temp_util_.push_back(
           builder_->makeIntConstant(kOutputPerVertexMemberPosition));
       target_pointer = builder_->createAccessChain(
           spv::StorageClassOutput, output_per_vertex_, id_vector_temp_util_);
-      break;
-    case InstructionStorageTarget::kColor:
+    } break;
+    case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: {
+      assert_true(is_vertex_shader());
+      assert_zero(used_write_mask & 0b1000);
+      target_pointer = var_main_point_size_edge_flag_kill_vertex_;
+    } break;
+    case InstructionStorageTarget::kColor: {
       assert_true(is_pixel_shader());
       assert_not_zero(used_write_mask);
       assert_true(current_shader().writes_color_target(result.storage_index));
@@ -1944,7 +2033,7 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result,
       // an empty write mask without independent blending.
       // TODO(Triang3l): Store the alpha of the first output in this case for
       // alpha test and alpha to coverage.
-      break;
+    } break;
     default:
       // TODO(Triang3l): All storage targets.
       break;
@@ -2179,6 +2268,57 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result,
       }
     }
   }
+
+  if (result.storage_target ==
+          InstructionStorageTarget::kPointSizeEdgeFlagKillVertex &&
+      used_write_mask & 0b001) {
+    // Make the point size non-negative as negative is used to indicate that the
+    // default size must be used, and also clamp it to the bounds the way the
+    // R400 (Adreno 200, to be more precise) hardware clamps it (functionally
+    // like a signed 32-bit integer, -NaN and -Infinity...-0 to the minimum,
+    // +NaN to the maximum).
+    spv::Id point_size = builder_->createUnaryOp(
+        spv::OpBitcast, type_int_,
+        builder_->createCompositeExtract(value_to_store, type_float_, 0));
+    id_vector_temp_util_.clear();
+    id_vector_temp_util_.push_back(
+        builder_->makeIntConstant(kSystemConstantPointVertexDiameterMin));
+    spv::Id point_vertex_diameter_min = builder_->createUnaryOp(
+        spv::OpBitcast, type_int_,
+        builder_->createLoad(
+            builder_->createAccessChain(spv::StorageClassUniform,
+                                        uniform_system_constants_,
+                                        id_vector_temp_util_),
+            spv::NoPrecision));
+    id_vector_temp_util_.clear();
+    id_vector_temp_util_.reserve(2);
+    id_vector_temp_util_.push_back(point_vertex_diameter_min);
+    id_vector_temp_util_.push_back(point_size);
+    point_size =
+        builder_->createBuiltinCall(type_int_, ext_inst_glsl_std_450_,
+                                    GLSLstd450SMax, id_vector_temp_util_);
+    id_vector_temp_util_.clear();
+    id_vector_temp_util_.push_back(
+        builder_->makeIntConstant(kSystemConstantPointVertexDiameterMax));
+    spv::Id point_vertex_diameter_max = builder_->createUnaryOp(
+        spv::OpBitcast, type_int_,
+        builder_->createLoad(
+            builder_->createAccessChain(spv::StorageClassUniform,
+                                        uniform_system_constants_,
+                                        id_vector_temp_util_),
+            spv::NoPrecision));
+    id_vector_temp_util_.clear();
+    id_vector_temp_util_.reserve(2);
+    id_vector_temp_util_.push_back(point_vertex_diameter_max);
+    id_vector_temp_util_.push_back(point_size);
+    point_size =
+        builder_->createBuiltinCall(type_int_, ext_inst_glsl_std_450_,
+                                    GLSLstd450SMin, id_vector_temp_util_);
+    value_to_store = builder_->createCompositeInsert(
+        builder_->createUnaryOp(spv::OpBitcast, type_float_, point_size),
+        value_to_store, type_float3_, 0);
+  }
+
   builder_->createStore(value_to_store, target_pointer);
 }
 
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h
index 69d05d95c..733bbf2ff 100644
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@@ -34,7 +34,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
     // TODO(Triang3l): Change to 0xYYYYMMDD once it's out of the rapid
     // prototyping stage (easier to do small granular updates with an
     // incremental counter).
-    static constexpr uint32_t kVersion = 5;
+    static constexpr uint32_t kVersion = 6;
 
     enum class DepthStencilMode : uint32_t {
       kNoModifiers,
@@ -50,6 +50,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
       // Interpolators written by the vertex shader and needed by the pixel
       // shader.
       uint32_t interpolator_mask : xenos::kMaxInterpolators;
+      uint32_t output_point_size : 1;
       // Dynamically indexable register count from SQ_PROGRAM_CNTL.
       uint32_t dynamic_addressable_register_count : 8;
       // Pipeline stage and input configuration.
@@ -145,10 +146,15 @@ class SpirvShaderTranslator : public ShaderTranslator {
     int32_t vertex_base_index;
 
     float ndc_scale[3];
-    uint32_t padding_ndc_scale;
+    float point_vertex_diameter_min;
 
     float ndc_offset[3];
-    uint32_t padding_ndc_offset;
+    float point_vertex_diameter_max;
+
+    float point_constant_diameter[2];
+    // Conversion from a diameter in guest screen coordinates to a radius
+    // (0.5 * diameter) in the NDC for the host viewport.
+    float point_screen_diameter_to_ndc_radius[2];
 
     // Each byte contains post-swizzle TextureSign values for each of the needed
     // components of each of the 32 used texture fetch constants.
@@ -603,7 +609,11 @@ class SpirvShaderTranslator : public ShaderTranslator {
     kSystemConstantVertexIndexEndian,
     kSystemConstantVertexBaseIndex,
     kSystemConstantNdcScale,
+    kSystemConstantPointVertexDiameterMin,
     kSystemConstantNdcOffset,
+    kSystemConstantPointVertexDiameterMax,
+    kSystemConstantPointConstantDiameter,
+    kSystemConstantPointScreenDiameterToNdcRadius,
     kSystemConstantTextureSwizzledSigns,
     kSystemConstantTextureSwizzles,
     kSystemConstantAlphaTestReference,
@@ -627,8 +637,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
   spv::Id input_vertex_index_;
   // VS as TES only - int.
   spv::Id input_primitive_id_;
+  // PS, only when needed - float2.
+  spv::Id input_point_coordinates_;
   // PS, only when needed - float4.
-  spv::Id input_fragment_coord_;
+  spv::Id input_fragment_coordinates_;
   // PS, only when needed - bool.
   spv::Id input_front_facing_;
 
@@ -643,6 +655,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
   // all).
   std::array<spv::Id, xenos::kMaxInterpolators> input_output_interpolators_;
 
+  // VS, only when needed - float.
+  spv::Id output_point_size_;
+
   enum OutputPerVertexMember : unsigned int {
     kOutputPerVertexMemberPosition,
     kOutputPerVertexMemberCount,
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 3c4422561..80affe639 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -3482,6 +3482,49 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
     system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i];
   }
 
+  // Point size.
+  if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
+    auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
+    auto pa_su_point_size = regs.Get<reg::PA_SU_POINT_SIZE>();
+    float point_vertex_diameter_min =
+        float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
+    float point_vertex_diameter_max =
+        float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
+    float point_constant_diameter_x =
+        float(pa_su_point_size.width) * (2.0f / 16.0f);
+    float point_constant_diameter_y =
+        float(pa_su_point_size.height) * (2.0f / 16.0f);
+    dirty |= system_constants_.point_vertex_diameter_min !=
+             point_vertex_diameter_min;
+    dirty |= system_constants_.point_vertex_diameter_max !=
+             point_vertex_diameter_max;
+    dirty |= system_constants_.point_constant_diameter[0] !=
+             point_constant_diameter_x;
+    dirty |= system_constants_.point_constant_diameter[1] !=
+             point_constant_diameter_y;
+    system_constants_.point_vertex_diameter_min = point_vertex_diameter_min;
+    system_constants_.point_vertex_diameter_max = point_vertex_diameter_max;
+    system_constants_.point_constant_diameter[0] = point_constant_diameter_x;
+    system_constants_.point_constant_diameter[1] = point_constant_diameter_y;
+    // 2 because 1 in the NDC is half of the viewport's axis, 0.5 for diameter
+    // to radius conversion to avoid multiplying the per-vertex diameter by an
+    // additional constant in the shader.
+    float point_screen_diameter_to_ndc_radius_x =
+        (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_x())) /
+        std::max(viewport_info.xy_extent[0], uint32_t(1));
+    float point_screen_diameter_to_ndc_radius_y =
+        (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_y())) /
+        std::max(viewport_info.xy_extent[1], uint32_t(1));
+    dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] !=
+             point_screen_diameter_to_ndc_radius_x;
+    dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] !=
+             point_screen_diameter_to_ndc_radius_y;
+    system_constants_.point_screen_diameter_to_ndc_radius[0] =
+        point_screen_diameter_to_ndc_radius_x;
+    system_constants_.point_screen_diameter_to_ndc_radius[1] =
+        point_screen_diameter_to_ndc_radius_y;
+  }
+
   // Texture signedness / gamma.
   {
     uint32_t textures_remaining = used_texture_mask;
diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc
index df7156b08..7cf30e250 100644
--- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc
+++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc
@@ -133,6 +133,11 @@ VulkanPipelineCache::GetCurrentVertexShaderModification(
 
   modification.vertex.interpolator_mask = interpolator_mask;
 
+  modification.vertex.output_point_size =
+      uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) &&
+               regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
+                   xenos::PrimitiveType::kPointList);
+
   return modification;
 }
 
@@ -284,6 +289,8 @@ bool VulkanPipelineCache::ConfigurePipeline(
   if (GetGeometryShaderKey(
           description.geometry_shader,
           SpirvShaderTranslator::Modification(vertex_shader->modification()),
+          SpirvShaderTranslator::Modification(
+              pixel_shader ? pixel_shader->modification() : 0),
           geometry_shader_key)) {
     geometry_shader = GetGeometryShader(geometry_shader_key);
     if (geometry_shader == VK_NULL_HANDLE) {
@@ -496,6 +503,7 @@ bool VulkanPipelineCache::GetCurrentStateDescription(
   PipelinePrimitiveTopology primitive_topology;
   switch (primitive_processing_result.host_primitive_type) {
     case xenos::PrimitiveType::kPointList:
+      geometry_shader = PipelineGeometryShader::kPointList;
       primitive_topology = PipelinePrimitiveTopology::kPointList;
       break;
     case xenos::PrimitiveType::kLineList:
@@ -815,6 +823,7 @@ bool VulkanPipelineCache::ArePipelineRequirementsMet(
 bool VulkanPipelineCache::GetGeometryShaderKey(
     PipelineGeometryShader geometry_shader_type,
     SpirvShaderTranslator::Modification vertex_shader_modification,
+    SpirvShaderTranslator::Modification pixel_shader_modification,
     GeometryShaderKey& key_out) {
   if (geometry_shader_type == PipelineGeometryShader::kNone) {
     return false;
@@ -831,10 +840,8 @@ bool VulkanPipelineCache::GetGeometryShaderKey(
       /* vertex_shader_modification.vertex.user_clip_plane_cull */ 0;
   key.has_vertex_kill_and =
       /* vertex_shader_modification.vertex.vertex_kill_and */ 0;
-  key.has_point_size =
-      /* vertex_shader_modification.vertex.output_point_size */ 0;
-  key.has_point_coordinates =
-      /* pixel_shader_modification.pixel.param_gen_point */ 0;
+  key.has_point_size = vertex_shader_modification.vertex.output_point_size;
+  key.has_point_coordinates = pixel_shader_modification.pixel.param_gen_point;
   key_out = key;
   return true;
 }
@@ -853,6 +860,13 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
   spv::ExecutionMode output_primitive_execution_mode = spv::ExecutionMode(0);
   uint32_t output_max_vertices = 0;
   switch (key.type) {
+    case PipelineGeometryShader::kPointList:
+      // Point to a strip of 2 triangles.
+      input_primitive_execution_mode = spv::ExecutionModeInputPoints;
+      input_primitive_vertex_count = 1;
+      output_primitive_execution_mode = spv::ExecutionModeOutputTriangleStrip;
+      output_max_vertices = 4;
+      break;
     case PipelineGeometryShader::kRectangleList:
       // Triangle to a strip of 2 triangles.
       input_primitive_execution_mode = spv::ExecutionModeTriangles;
@@ -901,6 +915,7 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
   spv::Id type_bool4 = builder.makeVectorType(type_bool, 4);
   spv::Id type_int = builder.makeIntType(32);
   spv::Id type_float = builder.makeFloatType(32);
+  spv::Id type_float2 = builder.makeVectorType(type_float, 2);
   spv::Id type_float4 = builder.makeVectorType(type_float, 4);
   spv::Id type_clip_distances =
       clip_distance_count
@@ -912,9 +927,54 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
           ? builder.makeArrayType(
                 type_float, builder.makeUintConstant(cull_distance_count), 0)
           : spv::NoType;
-  spv::Id type_point_coordinates = key.has_point_coordinates
-                                       ? builder.makeVectorType(type_float, 2)
-                                       : spv::NoType;
+
+  // System constants.
+  // For points:
+  // - float2 point_constant_diameter
+  // - float2 point_screen_diameter_to_ndc_radius
+  enum PointConstant : uint32_t {
+    kPointConstantConstantDiameter,
+    kPointConstantScreenDiameterToNdcRadius,
+    kPointConstantCount,
+  };
+  spv::Id type_system_constants = spv::NoType;
+  if (key.type == PipelineGeometryShader::kPointList) {
+    id_vector_temp.clear();
+    id_vector_temp.resize(kPointConstantCount);
+    id_vector_temp[kPointConstantConstantDiameter] = type_float2;
+    id_vector_temp[kPointConstantScreenDiameterToNdcRadius] = type_float2;
+    type_system_constants =
+        builder.makeStructType(id_vector_temp, "XeSystemConstants");
+    builder.addMemberName(type_system_constants, kPointConstantConstantDiameter,
+                          "point_constant_diameter");
+    builder.addMemberDecoration(
+        type_system_constants, kPointConstantConstantDiameter,
+        spv::DecorationOffset,
+        int(offsetof(SpirvShaderTranslator::SystemConstants,
+                     point_constant_diameter)));
+    builder.addMemberName(type_system_constants,
+                          kPointConstantScreenDiameterToNdcRadius,
+                          "point_screen_diameter_to_ndc_radius");
+    builder.addMemberDecoration(
+        type_system_constants, kPointConstantScreenDiameterToNdcRadius,
+        spv::DecorationOffset,
+        int(offsetof(SpirvShaderTranslator::SystemConstants,
+                     point_screen_diameter_to_ndc_radius)));
+  }
+  spv::Id uniform_system_constants = spv::NoResult;
+  if (type_system_constants != spv::NoType) {
+    builder.addDecoration(type_system_constants, spv::DecorationBlock);
+    uniform_system_constants = builder.createVariable(
+        spv::NoPrecision, spv::StorageClassUniform, type_system_constants,
+        "xe_uniform_system_constants");
+    builder.addDecoration(uniform_system_constants,
+                          spv::DecorationDescriptorSet,
+                          int(SpirvShaderTranslator::kDescriptorSetConstants));
+    builder.addDecoration(uniform_system_constants, spv::DecorationBinding,
+                          int(SpirvShaderTranslator::kConstantBufferSystem));
+    // Generating SPIR-V 1.0, no need to add bindings to the entry point's
+    // interface until SPIR-V 1.4.
+  }
 
   // Inputs and outputs - matching glslang order, in gl_PerVertex gl_in[],
   // user-defined outputs, user-defined inputs, out gl_PerVertex.
@@ -977,6 +1037,8 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
                              type_array_in_gl_per_vertex, "gl_in");
   main_interface.push_back(in_gl_per_vertex);
 
+  uint32_t output_location = 0;
+
   // Interpolators outputs.
   std::array<spv::Id, xenos::kMaxInterpolators> out_interpolators;
   for (uint32_t i = 0; i < key.interpolator_count; ++i) {
@@ -984,23 +1046,28 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
         spv::NoPrecision, spv::StorageClassOutput, type_float4,
         fmt::format("xe_out_interpolator_{}", i).c_str());
     out_interpolators[i] = out_interpolator;
-    builder.addDecoration(out_interpolator, spv::DecorationLocation, i);
+    builder.addDecoration(out_interpolator, spv::DecorationLocation,
+                          int(output_location));
     builder.addDecoration(out_interpolator, spv::DecorationInvariant);
     main_interface.push_back(out_interpolator);
+    ++output_location;
   }
 
   // Point coordinate output.
   spv::Id out_point_coordinates = spv::NoResult;
   if (key.has_point_coordinates) {
-    out_point_coordinates = builder.createVariable(
-        spv::NoPrecision, spv::StorageClassOutput, type_point_coordinates,
-        "xe_out_point_coordinates");
+    out_point_coordinates =
+        builder.createVariable(spv::NoPrecision, spv::StorageClassOutput,
+                               type_float2, "xe_out_point_coordinates");
     builder.addDecoration(out_point_coordinates, spv::DecorationLocation,
-                          key.interpolator_count);
+                          int(output_location));
     builder.addDecoration(out_point_coordinates, spv::DecorationInvariant);
     main_interface.push_back(out_point_coordinates);
+    ++output_location;
   }
 
+  uint32_t input_location = 0;
+
   // Interpolator inputs.
   std::array<spv::Id, xenos::kMaxInterpolators> in_interpolators;
   for (uint32_t i = 0; i < key.interpolator_count; ++i) {
@@ -1010,8 +1077,10 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
                               0),
         fmt::format("xe_in_interpolator_{}", i).c_str());
     in_interpolators[i] = in_interpolator;
-    builder.addDecoration(in_interpolator, spv::DecorationLocation, i);
+    builder.addDecoration(in_interpolator, spv::DecorationLocation,
+                          int(input_location));
     main_interface.push_back(in_interpolator);
+    ++input_location;
   }
 
   // Point size input.
@@ -1023,8 +1092,9 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
                               0),
         "xe_in_point_size");
     builder.addDecoration(in_point_size, spv::DecorationLocation,
-                          key.interpolator_count);
+                          int(input_location));
     main_interface.push_back(in_point_size);
+    ++input_location;
   }
 
   // out gl_PerVertex.
@@ -1198,6 +1268,231 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
   }
 
   switch (key.type) {
+    case PipelineGeometryShader::kPointList: {
+      // Expand the point sprite, with left-to-right, top-to-bottom UVs.
+
+      spv::Id const_int_0 = builder.makeIntConstant(0);
+      spv::Id const_int_1 = builder.makeIntConstant(1);
+      spv::Id const_float_0 = builder.makeFloatConstant(0.0f);
+
+      // Load the point diameter in guest pixels.
+      id_vector_temp.clear();
+      id_vector_temp.reserve(2);
+      id_vector_temp.push_back(
+          builder.makeIntConstant(int32_t(kPointConstantConstantDiameter)));
+      id_vector_temp.push_back(const_int_0);
+      spv::Id point_guest_diameter_x = builder.createLoad(
+          builder.createAccessChain(spv::StorageClassUniform,
+                                    uniform_system_constants, id_vector_temp),
+          spv::NoPrecision);
+      id_vector_temp.back() = const_int_1;
+      spv::Id point_guest_diameter_y = builder.createLoad(
+          builder.createAccessChain(spv::StorageClassUniform,
+                                    uniform_system_constants, id_vector_temp),
+          spv::NoPrecision);
+      if (key.has_point_size) {
+        // The vertex shader's header writes -1.0 to point_size by default, so
+        // any non-negative value means that it was overwritten by the
+        // translated vertex shader, and needs to be used instead of the
+        // constant size. The per-vertex diameter is already clamped in the
+        // vertex shader (combined with making it non-negative).
+        id_vector_temp.clear();
+        // 0 is the input primitive vertex index.
+        id_vector_temp.push_back(const_int_0);
+        spv::Id point_vertex_diameter = builder.createLoad(
+            builder.createAccessChain(spv::StorageClassInput, in_point_size,
+                                      id_vector_temp),
+            spv::NoPrecision);
+        spv::Id point_vertex_diameter_written =
+            builder.createBinOp(spv::OpFOrdGreaterThanEqual, type_bool,
+                                point_vertex_diameter, const_float_0);
+        point_guest_diameter_x = builder.createTriOp(
+            spv::OpSelect, type_float, point_vertex_diameter_written,
+            point_vertex_diameter, point_guest_diameter_x);
+        point_guest_diameter_y = builder.createTriOp(
+            spv::OpSelect, type_float, point_vertex_diameter_written,
+            point_vertex_diameter, point_guest_diameter_y);
+      }
+
+      // 4D5307F1 has zero-size snowflakes, drop them quicker, and also drop
+      // points with a constant size of zero since point lists may also be used
+      // as just "compute" with memexport.
+      spv::Id point_size_not_zero = builder.createBinOp(
+          spv::OpLogicalAnd, type_bool,
+          builder.createBinOp(spv::OpFOrdGreaterThan, type_bool,
+                              point_guest_diameter_x, const_float_0),
+          builder.createBinOp(spv::OpFOrdGreaterThan, type_bool,
+                              point_guest_diameter_y, const_float_0));
+      spv::Block& point_size_zero_predecessor = *builder.getBuildPoint();
+      spv::Block& point_size_zero_then_block = builder.makeNewBlock();
+      spv::Block& point_size_zero_merge_block = builder.makeNewBlock();
+      {
+        std::unique_ptr<spv::Instruction> selection_merge_op(
+            std::make_unique<spv::Instruction>(spv::OpSelectionMerge));
+        selection_merge_op->addIdOperand(point_size_zero_merge_block.getId());
+        selection_merge_op->addImmediateOperand(
+            spv::SelectionControlDontFlattenMask);
+        point_size_zero_predecessor.addInstruction(
+            std::move(selection_merge_op));
+      }
+      {
+        std::unique_ptr<spv::Instruction> branch_conditional_op(
+            std::make_unique<spv::Instruction>(spv::OpBranchConditional));
+        branch_conditional_op->addIdOperand(point_size_not_zero);
+        branch_conditional_op->addIdOperand(
+            point_size_zero_merge_block.getId());
+        branch_conditional_op->addIdOperand(point_size_zero_then_block.getId());
+        branch_conditional_op->addImmediateOperand(2);
+        branch_conditional_op->addImmediateOperand(1);
+        point_size_zero_predecessor.addInstruction(
+            std::move(branch_conditional_op));
+      }
+      point_size_zero_then_block.addPredecessor(&point_size_zero_predecessor);
+      point_size_zero_merge_block.addPredecessor(&point_size_zero_predecessor);
+      builder.setBuildPoint(&point_size_zero_then_block);
+      builder.createNoResultOp(spv::OpReturn);
+      builder.setBuildPoint(&point_size_zero_merge_block);
+
+      // Transform the diameter in the guest screen coordinates to radius in the
+      // normalized device coordinates, and then to the clip space by
+      // multiplying by W.
+      id_vector_temp.clear();
+      id_vector_temp.reserve(2);
+      id_vector_temp.push_back(builder.makeIntConstant(
+          int32_t(kPointConstantScreenDiameterToNdcRadius)));
+      id_vector_temp.push_back(const_int_0);
+      spv::Id point_radius_x = builder.createBinOp(
+          spv::OpFMul, type_float, point_guest_diameter_x,
+          builder.createLoad(builder.createAccessChain(spv::StorageClassUniform,
+                                                       uniform_system_constants,
+                                                       id_vector_temp),
+                             spv::NoPrecision));
+      builder.addDecoration(point_radius_x, spv::DecorationNoContraction);
+      id_vector_temp.back() = const_int_1;
+      spv::Id point_radius_y = builder.createBinOp(
+          spv::OpFMul, type_float, point_guest_diameter_y,
+          builder.createLoad(builder.createAccessChain(spv::StorageClassUniform,
+                                                       uniform_system_constants,
+                                                       id_vector_temp),
+                             spv::NoPrecision));
+      builder.addDecoration(point_radius_y, spv::DecorationNoContraction);
+      id_vector_temp.clear();
+      id_vector_temp.reserve(2);
+      // 0 is the input primitive vertex index.
+      id_vector_temp.push_back(const_int_0);
+      id_vector_temp.push_back(const_member_in_gl_per_vertex_position);
+      spv::Id point_position = builder.createLoad(
+          builder.createAccessChain(spv::StorageClassInput, in_gl_per_vertex,
+                                    id_vector_temp),
+          spv::NoPrecision);
+      spv::Id point_w =
+          builder.createCompositeExtract(point_position, type_float, 3);
+      point_radius_x =
+          builder.createBinOp(spv::OpFMul, type_float, point_radius_x, point_w);
+      builder.addDecoration(point_radius_x, spv::DecorationNoContraction);
+      point_radius_y =
+          builder.createBinOp(spv::OpFMul, type_float, point_radius_y, point_w);
+      builder.addDecoration(point_radius_y, spv::DecorationNoContraction);
+
+      // Load the inputs for the guest point.
+      // Interpolators.
+      std::array<spv::Id, xenos::kMaxInterpolators> point_interpolators;
+      id_vector_temp.clear();
+      // 0 is the input primitive vertex index.
+      id_vector_temp.push_back(const_int_0);
+      for (uint32_t i = 0; i < key.interpolator_count; ++i) {
+        point_interpolators[i] = builder.createLoad(
+            builder.createAccessChain(spv::StorageClassInput,
+                                      in_interpolators[i], id_vector_temp),
+            spv::NoPrecision);
+      }
+      // Positions.
+      spv::Id point_x =
+          builder.createCompositeExtract(point_position, type_float, 0);
+      spv::Id point_y =
+          builder.createCompositeExtract(point_position, type_float, 1);
+      std::array<spv::Id, 2> point_edge_x, point_edge_y;
+      for (uint32_t i = 0; i < 2; ++i) {
+        spv::Op point_radius_add_op = i ? spv::OpFAdd : spv::OpFSub;
+        point_edge_x[i] = builder.createBinOp(point_radius_add_op, type_float,
+                                              point_x, point_radius_x);
+        builder.addDecoration(point_edge_x[i], spv::DecorationNoContraction);
+        point_edge_y[i] = builder.createBinOp(point_radius_add_op, type_float,
+                                              point_y, point_radius_y);
+        builder.addDecoration(point_edge_y[i], spv::DecorationNoContraction);
+      }
+      spv::Id point_z =
+          builder.createCompositeExtract(point_position, type_float, 2);
+      // Clip distances.
+      spv::Id point_clip_distances = spv::NoResult;
+      if (clip_distance_count) {
+        id_vector_temp.clear();
+        id_vector_temp.reserve(2);
+        // 0 is the input primitive vertex index.
+        id_vector_temp.push_back(const_int_0);
+        id_vector_temp.push_back(const_member_in_gl_per_vertex_clip_distance);
+        point_clip_distances = builder.createLoad(
+            builder.createAccessChain(spv::StorageClassInput, in_gl_per_vertex,
+                                      id_vector_temp),
+            spv::NoPrecision);
+      }
+
+      for (uint32_t i = 0; i < 4; ++i) {
+        // Same interpolators for the entire sprite.
+        for (uint32_t j = 0; j < key.interpolator_count; ++j) {
+          builder.createStore(point_interpolators[j], out_interpolators[j]);
+        }
+        // Top-left, bottom-left, top-right, bottom-right order (chosen
+        // arbitrarily, simply based on counterclockwise meaning front with
+        // frontFace = VkFrontFace(0), but faceness is ignored for non-polygon
+        // primitive types).
+        uint32_t point_vertex_x = i >> 1;
+        uint32_t point_vertex_y = i & 1;
+        // Point coordinates.
+        if (key.has_point_coordinates) {
+          id_vector_temp.clear();
+          id_vector_temp.reserve(2);
+          id_vector_temp.push_back(
+              builder.makeFloatConstant(float(point_vertex_x)));
+          id_vector_temp.push_back(
+              builder.makeFloatConstant(float(point_vertex_y)));
+          builder.createStore(
+              builder.makeCompositeConstant(type_float2, id_vector_temp),
+              out_point_coordinates);
+        }
+        // Position.
+        id_vector_temp.clear();
+        id_vector_temp.reserve(4);
+        id_vector_temp.push_back(point_edge_x[point_vertex_x]);
+        id_vector_temp.push_back(point_edge_y[point_vertex_y]);
+        id_vector_temp.push_back(point_z);
+        id_vector_temp.push_back(point_w);
+        spv::Id point_vertex_position =
+            builder.createCompositeConstruct(type_float4, id_vector_temp);
+        id_vector_temp.clear();
+        id_vector_temp.push_back(const_member_out_gl_per_vertex_position);
+        builder.createStore(
+            point_vertex_position,
+            builder.createAccessChain(spv::StorageClassOutput,
+                                      out_gl_per_vertex, id_vector_temp));
+        // Clip distances.
+        // TODO(Triang3l): Handle ps_ucp_mode properly, clip expanded points if
+        // needed.
+        if (clip_distance_count) {
+          id_vector_temp.clear();
+          id_vector_temp.push_back(
+              const_member_out_gl_per_vertex_clip_distance);
+          builder.createStore(
+              point_clip_distances,
+              builder.createAccessChain(spv::StorageClassOutput,
+                                        out_gl_per_vertex, id_vector_temp));
+        }
+        // Emit the vertex.
+        builder.createNoResultOp(spv::OpEmitVertex);
+      }
+      builder.createNoResultOp(spv::OpEndPrimitive);
+    } break;
+
     case PipelineGeometryShader::kRectangleList: {
       // Construct a strip with the fourth vertex generated by mirroring a
       // vertex across the longest edge (the diagonal).
@@ -1308,8 +1603,8 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
         id_vector_temp.reserve(2);
         id_vector_temp.push_back(const_float_0);
         id_vector_temp.push_back(const_float_0);
-        const_point_coordinates_zero = builder.makeCompositeConstant(
-            type_point_coordinates, id_vector_temp);
+        const_point_coordinates_zero =
+            builder.makeCompositeConstant(type_float2, id_vector_temp);
       }
 
       // Emit the triangle in the strip that consists of the original vertices.
@@ -1491,8 +1786,8 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) {
         id_vector_temp.reserve(2);
         id_vector_temp.push_back(const_float_0);
         id_vector_temp.push_back(const_float_0);
-        const_point_coordinates_zero = builder.makeCompositeConstant(
-            type_point_coordinates, id_vector_temp);
+        const_point_coordinates_zero =
+            builder.makeCompositeConstant(type_float2, id_vector_temp);
       }
 
       // Build the triangle strip from the original quad vertices in the
diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h
index e967a1415..6e0c73ab0 100644
--- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h
+++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h
@@ -92,6 +92,7 @@ class VulkanPipelineCache {
  private:
   enum class PipelineGeometryShader : uint32_t {
     kNone,
+    kPointList,
     kRectangleList,
     kQuadList,
   };
@@ -267,6 +268,7 @@ class VulkanPipelineCache {
   static bool GetGeometryShaderKey(
       PipelineGeometryShader geometry_shader_type,
       SpirvShaderTranslator::Modification vertex_shader_modification,
+      SpirvShaderTranslator::Modification pixel_shader_modification,
       GeometryShaderKey& key_out);
   VkShaderModule GetGeometryShader(GeometryShaderKey key);
 

From 8fb5da18eaa8f9d96d556fb67fd090e0fc055ab8 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Tue, 26 Jul 2022 16:24:14 +0300
Subject: [PATCH 2/5] [Vulkan] Add forgotten fullDrawIndexUint32 check

---
 src/xenia/gpu/vulkan/vulkan_primitive_processor.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc
index 058b6a5d1..b7f37f4b9 100644
--- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc
@@ -27,14 +27,13 @@ namespace vulkan {
 VulkanPrimitiveProcessor::~VulkanPrimitiveProcessor() { Shutdown(true); }
 
 bool VulkanPrimitiveProcessor::Initialize() {
-  // TODO(Triang3l): fullDrawIndexUint32 feature check and indirect index fetch.
   const ui::vulkan::VulkanProvider& provider =
       command_processor_.GetVulkanProvider();
   const VkPhysicalDeviceFeatures& device_features = provider.device_features();
   const VkPhysicalDevicePortabilitySubsetFeaturesKHR*
       device_portability_subset_features =
           provider.device_portability_subset_features();
-  if (!InitializeCommon(true,
+  if (!InitializeCommon(device_features.fullDrawIndexUint32,
                         !device_portability_subset_features ||
                             device_portability_subset_features->triangleFans,
                         false, device_features.geometryShader)) {

From 66c995f3aa26cb3e860144d9cf38883fadf6e4e3 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Tue, 26 Jul 2022 17:04:22 +0300
Subject: [PATCH 3/5] [SPIR-V] Saturate point sprite coordinates

---
 src/xenia/gpu/spirv_shader_translator.cc | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc
index 452ac1450..199d0f99c 100644
--- a/src/xenia/gpu/spirv_shader_translator.cc
+++ b/src/xenia/gpu/spirv_shader_translator.cc
@@ -1619,8 +1619,17 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() {
     spv::Id param_gen_z, param_gen_w;
     if (modification.pixel.param_gen_point) {
       assert_true(input_point_coordinates_ != spv::NoResult);
+      // Saturate to avoid negative point coordinates if the center of the pixel
+      // is not covered, and extrapolation is done.
+      id_vector_temp_.clear();
+      id_vector_temp_.reserve(3);
+      id_vector_temp_.push_back(
+          builder_->createLoad(input_point_coordinates_, spv::NoPrecision));
+      id_vector_temp_.push_back(const_float2_0_);
+      id_vector_temp_.push_back(const_float2_1_);
       spv::Id param_gen_point_coordinates =
-          builder_->createLoad(input_point_coordinates_, spv::NoPrecision);
+          builder_->createBuiltinCall(type_float2_, ext_inst_glsl_std_450_,
+                                      GLSLstd450NClamp, id_vector_temp_);
       param_gen_z = builder_->createCompositeExtract(
           param_gen_point_coordinates, type_float_, 0);
       param_gen_w = builder_->createCompositeExtract(

From ff7ef050632fb0c85760383f911bb9328e42176c Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Tue, 26 Jul 2022 17:08:12 +0300
Subject: [PATCH 4/5] [SPIR-V] Clamp cube face using NClamp, not NMax/FMin

---
 src/xenia/gpu/spirv_shader_translator_fetch.cc | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/xenia/gpu/spirv_shader_translator_fetch.cc b/src/xenia/gpu/spirv_shader_translator_fetch.cc
index 7be662460..88d3bd5ab 100644
--- a/src/xenia/gpu/spirv_shader_translator_fetch.cc
+++ b/src/xenia/gpu/spirv_shader_translator_fetch.cc
@@ -1296,18 +1296,14 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction(
           builder_->addDecoration(face, spv::DecorationNoContraction);
         }
         id_vector_temp_.clear();
-        id_vector_temp_.reserve(2);
+        id_vector_temp_.reserve(3);
+        id_vector_temp_.push_back(face);
         id_vector_temp_.push_back(const_float_0_);
-        id_vector_temp_.push_back(face);
-        face = builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_,
-                                           GLSLstd450NMax, id_vector_temp_);
-        id_vector_temp_.clear();
-        id_vector_temp_.reserve(2);
         id_vector_temp_.push_back(builder_->makeFloatConstant(5.0f));
-        id_vector_temp_.push_back(face);
-        face = builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_,
-                                           GLSLstd450FMin, id_vector_temp_);
-        face = builder_->createUnaryOp(spv::OpConvertFToU, type_uint_, face);
+        face = builder_->createUnaryOp(
+            spv::OpConvertFToU, type_uint_,
+            builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_,
+                                        GLSLstd450NClamp, id_vector_temp_));
         // Split the face index into the axis and the sign.
         spv::Id const_uint_1 = builder_->makeUintConstant(1);
         spv::Id face_axis = builder_->createBinOp(

From 7595cdb52bd12d448aeabe4908862f59d283ce9d Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Wed, 27 Jul 2022 17:14:28 +0300
Subject: [PATCH 5/5] [Vulkan] Non-GS point sprites + minor SPIR-V fixes

---
 .../gpu/d3d12/d3d12_command_processor.cc      |   7 +-
 .../gpu/d3d12/d3d12_primitive_processor.cc    |  25 +-
 .../gpu/d3d12/d3d12_primitive_processor.h     |   5 +-
 src/xenia/gpu/dxbc_shader_translator.cc       |   4 -
 src/xenia/gpu/primitive_processor.cc          | 250 +++++++--
 src/xenia/gpu/primitive_processor.h           |  57 ++-
 src/xenia/gpu/spirv_shader_translator.cc      | 482 ++++++++++++++----
 src/xenia/gpu/spirv_shader_translator.h       |   9 +-
 .../gpu/vulkan/vulkan_command_processor.cc    |  90 ++--
 .../gpu/vulkan/vulkan_command_processor.h     |   4 +-
 src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc |  29 +-
 src/xenia/gpu/vulkan/vulkan_pipeline_cache.h  |   2 +-
 .../gpu/vulkan/vulkan_primitive_processor.cc  |  26 +-
 .../gpu/vulkan/vulkan_primitive_processor.h   |   5 +-
 14 files changed, 721 insertions(+), 274 deletions(-)

diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index b6f72ff9b..129f89fd0 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -2268,7 +2268,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   UpdateSystemConstantValues(
       memexport_used, primitive_polygonal,
       primitive_processing_result.line_loop_closing_index,
-      primitive_processing_result.host_index_endian, viewport_info,
+      primitive_processing_result.host_shader_index_endian, viewport_info,
       used_texture_mask, normalized_depth_control, normalized_color_mask);
 
   // Update constant buffers, descriptors and root parameters.
@@ -2513,7 +2513,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
     }
     ID3D12Resource* scratch_index_buffer = nullptr;
     switch (primitive_processing_result.index_buffer_type) {
-      case PrimitiveProcessor::ProcessedIndexBufferType::kGuest: {
+      case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA: {
         if (memexport_used) {
           // If the shared memory is a UAV, it can't be used as an index buffer
           // (UAV is a read/write state, index buffer is a read-only state).
@@ -2545,7 +2545,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
             primitive_processor_->GetConvertedIndexBufferGpuAddress(
                 primitive_processing_result.host_index_buffer_handle);
         break;
-      case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin:
+      case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto:
+      case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA:
         index_buffer_view.BufferLocation =
             primitive_processor_->GetBuiltinIndexBufferGpuAddress(
                 primitive_processing_result.host_index_buffer_handle);
diff --git a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
index a806546a1..03e67d9ac 100644
--- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
@@ -28,7 +28,7 @@ namespace d3d12 {
 D3D12PrimitiveProcessor::~D3D12PrimitiveProcessor() { Shutdown(true); }
 
 bool D3D12PrimitiveProcessor::Initialize() {
-  if (!InitializeCommon(true, false, false, true)) {
+  if (!InitializeCommon(true, false, false, true, true, true)) {
     Shutdown();
     return false;
   }
@@ -83,9 +83,9 @@ void D3D12PrimitiveProcessor::EndFrame() {
   frame_index_buffers_.clear();
 }
 
-bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
-    uint32_t index_count, std::function<void(uint16_t*)> fill_callback) {
-  assert_not_zero(index_count);
+bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer(
+    size_t size_bytes, std::function<void(void*)> fill_callback) {
+  assert_not_zero(size_bytes);
   assert_null(builtin_index_buffer_);
   assert_null(builtin_index_buffer_upload_);
 
@@ -94,9 +94,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
   ID3D12Device* device = provider.GetDevice();
 
   D3D12_RESOURCE_DESC resource_desc;
-  ui::d3d12::util::FillBufferResourceDesc(
-      resource_desc, UINT64(sizeof(uint16_t) * index_count),
-      D3D12_RESOURCE_FLAG_NONE);
+  ui::d3d12::util::FillBufferResourceDesc(resource_desc, UINT64(size_bytes),
+                                          D3D12_RESOURCE_FLAG_NONE);
   Microsoft::WRL::ComPtr<ID3D12Resource> draw_resource;
   if (FAILED(device->CreateCommittedResource(
           &ui::d3d12::util::kHeapPropertiesDefault,
@@ -105,8 +104,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
           IID_PPV_ARGS(&draw_resource)))) {
     XELOGE(
         "D3D12 primitive processor: Failed to create the built-in index "
-        "buffer GPU resource with {} 16-bit indices",
-        index_count);
+        "buffer GPU resource with {} bytes",
+        size_bytes);
     return false;
   }
   Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
@@ -117,8 +116,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
           IID_PPV_ARGS(&upload_resource)))) {
     XELOGE(
         "D3D12 primitive processor: Failed to create the built-in index "
-        "buffer upload resource with {} 16-bit indices",
-        index_count);
+        "buffer upload resource with {} bytes",
+        size_bytes);
     return false;
   }
 
@@ -127,8 +126,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
   if (FAILED(upload_resource->Map(0, &upload_read_range, &mapping))) {
     XELOGE(
         "D3D12 primitive processor: Failed to map the built-in index buffer "
-        "upload resource with {} 16-bit indices",
-        index_count);
+        "upload resource with {} bytes",
+        size_bytes);
     return false;
   }
   fill_callback(reinterpret_cast<uint16_t*>(mapping));
diff --git a/src/xenia/gpu/d3d12/d3d12_primitive_processor.h b/src/xenia/gpu/d3d12/d3d12_primitive_processor.h
index 81e1812a6..8ac02f4db 100644
--- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.h
@@ -56,9 +56,8 @@ class D3D12PrimitiveProcessor final : public PrimitiveProcessor {
   }
 
  protected:
-  bool InitializeBuiltin16BitIndexBuffer(
-      uint32_t index_count,
-      std::function<void(uint16_t*)> fill_callback) override;
+  bool InitializeBuiltinIndexBuffer(
+      size_t size_bytes, std::function<void(void*)> fill_callback) override;
 
   void* RequestHostConvertedIndexBufferForCurrentFrame(
       xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc
index 602da9ce8..daa8cf782 100644
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
@@ -964,8 +964,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() {
 
   // Check if the shader returns XY/W rather than XY, and if it does, revert
   // that.
-  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
-  // affine interpolation.
   a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_XYDividedByW));
   a_.OpIf(true, temp_x_src);
   a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0011),
@@ -974,8 +972,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() {
   a_.OpEndIf();
 
   // Check if the shader returns Z/W rather than Z, and if it does, revert that.
-  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
-  // affine interpolation.
   a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_ZDividedByW));
   a_.OpIf(true, temp_x_src);
   a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0100),
diff --git a/src/xenia/gpu/primitive_processor.cc b/src/xenia/gpu/primitive_processor.cc
index 68da6d100..827fb7b4e 100644
--- a/src/xenia/gpu/primitive_processor.cc
+++ b/src/xenia/gpu/primitive_processor.cc
@@ -9,6 +9,7 @@
 
 #include "xenia/gpu/primitive_processor.h"
 
+#include <algorithm>
 #include <cstring>
 #include <functional>
 #include <utility>
@@ -106,7 +107,9 @@ PrimitiveProcessor::~PrimitiveProcessor() { ShutdownCommon(); }
 
 bool PrimitiveProcessor::InitializeCommon(
     bool full_32bit_vertex_indices_supported, bool triangle_fans_supported,
-    bool line_loops_supported, bool quad_lists_supported) {
+    bool line_loops_supported, bool quad_lists_supported,
+    bool point_sprites_supported_without_vs_expansion,
+    bool rectangle_lists_supported_without_vs_expansion) {
   full_32bit_vertex_indices_used_ = full_32bit_vertex_indices_supported;
   convert_triangle_fans_to_lists_ =
       !triangle_fans_supported || cvars::force_convert_triangle_fans_to_lists;
@@ -115,33 +118,94 @@ bool PrimitiveProcessor::InitializeCommon(
   convert_quad_lists_to_triangle_lists_ =
       !quad_lists_supported ||
       cvars::force_convert_quad_lists_to_triangle_lists;
+  // No override cvars as hosts are not required to support the fallback paths
+  // since they require different vertex shader structure (for the fallback
+  // HostVertexShaderTypes).
+  expand_point_sprites_in_vs_ = !point_sprites_supported_without_vs_expansion;
+  expand_rectangle_lists_in_vs_ =
+      !rectangle_lists_supported_without_vs_expansion;
 
   // Initialize the index buffer for conversion of auto-indexed primitive types.
-  uint32_t builtin_index_count = 0;
+  size_t builtin_index_buffer_size = 0;
+  // 32-bit, before 16-bit due to alignment (for primitive expansion - when the
+  // indices encode not only the guest vertex index, but also a part needed for
+  // host expansion, thus may contain values above UINT16_MAX, such as up to
+  // (UINT16_MAX - 1) * 4 + 3 for point sprites).
+  // An index buffer is used for point sprite and rectangle list expansion
+  // instead of instancing because the way instancing is implemented may vary
+  // wildly between GPUs, and it may be slow (for example, if different
+  // instances can't share a wavefront) with small vertex counts per instance.
+  // Triangle strips with primitive restart are used rather than triangle lists
+  // so the vertex shader may be invoked only once for the inner edge vertices,
+  // which is important for memory export in guest shaders, so that the same
+  // location is not written from two invocations.
+  uint32_t builtin_ib_two_triangle_strip_count = 0;
+  if (expand_point_sprites_in_vs_) {
+    builtin_ib_two_triangle_strip_count =
+        std::max(uint32_t(UINT16_MAX), builtin_ib_two_triangle_strip_count);
+  }
+  if (expand_rectangle_lists_in_vs_) {
+    builtin_ib_two_triangle_strip_count =
+        std::max(uint32_t(UINT16_MAX / 3), builtin_ib_two_triangle_strip_count);
+  }
+  if (builtin_ib_two_triangle_strip_count) {
+    builtin_ib_offset_two_triangle_strips_ = builtin_index_buffer_size;
+    builtin_index_buffer_size +=
+        sizeof(uint32_t) *
+        GetTwoTriangleStripIndexCount(builtin_ib_two_triangle_strip_count);
+  } else {
+    builtin_ib_offset_two_triangle_strips_ = SIZE_MAX;
+  }
+  // 16-bit (for indirection on top of single auto-indexed vertices) - enough
+  // even if the backend has primitive reset enabled all the time (Metal) as
+  // auto-indexed draws are limited to UINT16_MAX vertices, not UINT16_MAX + 1.
   if (convert_triangle_fans_to_lists_) {
-    builtin_ib_offset_triangle_fans_to_lists_ =
-        sizeof(uint16_t) * builtin_index_count;
-    builtin_index_count += GetTriangleFanListIndexCount(UINT16_MAX);
+    builtin_ib_offset_triangle_fans_to_lists_ = builtin_index_buffer_size;
+    builtin_index_buffer_size +=
+        sizeof(uint16_t) * GetTriangleFanListIndexCount(UINT16_MAX);
   } else {
     builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
   }
   if (convert_quad_lists_to_triangle_lists_) {
-    builtin_ib_offset_quad_lists_to_triangle_lists_ =
-        sizeof(uint16_t) * builtin_index_count;
-    builtin_index_count += GetQuadListTriangleListIndexCount(UINT16_MAX);
+    builtin_ib_offset_quad_lists_to_triangle_lists_ = builtin_index_buffer_size;
+    builtin_index_buffer_size +=
+        sizeof(uint16_t) * GetQuadListTriangleListIndexCount(UINT16_MAX);
   } else {
     builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
   }
-  if (builtin_index_count) {
-    if (!InitializeBuiltin16BitIndexBuffer(
-            builtin_index_count, [this](uint16_t* mapping) {
+  if (builtin_index_buffer_size) {
+    if (!InitializeBuiltinIndexBuffer(
+            builtin_index_buffer_size,
+            [this, builtin_ib_two_triangle_strip_count](void* mapping) {
+              uint32_t* mapping_32bit = reinterpret_cast<uint32_t*>(mapping);
+              if (builtin_ib_offset_two_triangle_strips_ != SIZE_MAX) {
+                // Two-triangle strips.
+                uint32_t* two_triangle_strip_ptr =
+                    mapping_32bit +
+                    builtin_ib_offset_two_triangle_strips_ / sizeof(uint32_t);
+                for (uint32_t i = 0; i < builtin_ib_two_triangle_strip_count;
+                     ++i) {
+                  if (i) {
+                    // Primitive restart.
+                    *(two_triangle_strip_ptr++) = UINT32_MAX;
+                  }
+                  // Host vertex index within the pair in the lower 2 bits,
+                  // guest primitive index in the rest.
+                  uint32_t two_triangle_strip_first_index = i << 2;
+                  for (uint32_t j = 0; j < 4; ++j) {
+                    *(two_triangle_strip_ptr++) =
+                        two_triangle_strip_first_index + j;
+                  }
+                }
+              }
+              uint16_t* mapping_16bit = reinterpret_cast<uint16_t*>(mapping);
               if (builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX) {
                 // Triangle fans as triangle lists.
                 // Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D.
                 // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
                 uint16_t* triangle_list_ptr =
-                    mapping + builtin_ib_offset_triangle_fans_to_lists_ /
-                                  sizeof(uint16_t);
+                    mapping_16bit + builtin_ib_offset_triangle_fans_to_lists_ /
+                                        sizeof(uint16_t);
                 for (uint32_t i = 2; i < UINT16_MAX; ++i) {
                   *(triangle_list_ptr++) = uint16_t(i - 1);
                   *(triangle_list_ptr++) = uint16_t(i);
@@ -150,8 +214,9 @@ bool PrimitiveProcessor::InitializeCommon(
               }
               if (builtin_ib_offset_quad_lists_to_triangle_lists_ != SIZE_MAX) {
                 uint16_t* triangle_list_ptr =
-                    mapping + builtin_ib_offset_quad_lists_to_triangle_lists_ /
-                                  sizeof(uint16_t);
+                    mapping_16bit +
+                    builtin_ib_offset_quad_lists_to_triangle_lists_ /
+                        sizeof(uint16_t);
                 // TODO(Triang3l): SIMD for faster initialization?
                 for (uint32_t i = 0; i < UINT16_MAX / 4; ++i) {
                   uint16_t quad_first_index = uint16_t(i * 4);
@@ -309,15 +374,27 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
       return false;
     }
   } else {
+    host_vertex_shader_type = Shader::HostVertexShaderType::kVertex;
     switch (guest_primitive_type) {
       case xenos::PrimitiveType::kPointList:
+        if (expand_point_sprites_in_vs_) {
+          host_primitive_type = xenos::PrimitiveType::kTriangleStrip;
+          host_vertex_shader_type =
+              Shader::HostVertexShaderType::kPointListAsTriangleStrip;
+        }
+        break;
       case xenos::PrimitiveType::kLineList:
       case xenos::PrimitiveType::kLineStrip:
       case xenos::PrimitiveType::kTriangleList:
       case xenos::PrimitiveType::kTriangleStrip:
+        // Supported natively on all backends.
+        break;
       case xenos::PrimitiveType::kRectangleList:
-        // Supported natively or through geometry or compute shaders on all
-        // backends.
+        if (expand_rectangle_lists_in_vs_) {
+          host_primitive_type = xenos::PrimitiveType::kTriangleStrip;
+          host_vertex_shader_type =
+              Shader::HostVertexShaderType::kRectangleListAsTriangleStrip;
+        }
         break;
       case xenos::PrimitiveType::kTriangleFan:
         if (convert_triangle_fans_to_lists_) {
@@ -342,7 +419,6 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
         assert_always();
         return false;
     }
-    host_vertex_shader_type = Shader::HostVertexShaderType::kVertex;
   }
 
   // Process the indices.
@@ -359,12 +435,86 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
     guest_draw_vertex_count = vgt_dma_size.num_words;
   }
   uint32_t line_loop_closing_index = 0;
-  uint32_t guest_index_base;
+  uint32_t guest_index_base = 0, guest_index_buffer_needed_bytes = 0;
   CachedResult cacheable;
   cacheable.host_draw_vertex_count = guest_draw_vertex_count;
   cacheable.host_primitive_reset_enabled = false;
   cacheable.host_index_buffer_handle = SIZE_MAX;
-  if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) {
+  if (host_vertex_shader_type ==
+          Shader::HostVertexShaderType::kPointListAsTriangleStrip ||
+      host_vertex_shader_type ==
+          Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
+    // As two-triangle strips, with guest indices being either autogenerated or
+    // fetched via DMA.
+    uint32_t primitive_count = guest_draw_vertex_count;
+    if (host_vertex_shader_type ==
+        Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
+      primitive_count /= 3;
+    }
+    cacheable.host_draw_vertex_count =
+        GetTwoTriangleStripIndexCount(primitive_count);
+    cacheable.host_index_format = xenos::IndexFormat::kInt32;
+    cacheable.host_primitive_reset_enabled = true;
+    assert_true(builtin_ib_offset_two_triangle_strips_ != SIZE_MAX);
+    cacheable.host_index_buffer_handle = builtin_ib_offset_two_triangle_strips_;
+    if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) {
+      cacheable.index_buffer_type =
+          ProcessedIndexBufferType::kHostBuiltinForAuto;
+      cacheable.host_shader_index_endian = xenos::Endian::kNone;
+    } else {
+      // There is an index buffer.
+      assert_true(vgt_draw_initiator.source_select ==
+                  xenos::SourceSelect::kDMA);
+      if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA) {
+        // TODO(Triang3l): Support immediate-indexed vertices.
+        XELOGE(
+            "Primitive processor: Unsupported vertex index source {}. Report "
+            "the game to Xenia developers!",
+            uint32_t(vgt_draw_initiator.source_select));
+        return false;
+      }
+      xenos::IndexFormat guest_index_format = vgt_draw_initiator.index_size;
+      // Normalize the endian.
+      cacheable.index_buffer_type =
+          ProcessedIndexBufferType::kHostBuiltinForDMA;
+      xenos::Endian guest_index_endian = vgt_dma_size.swap_mode;
+      if (guest_index_format == xenos::IndexFormat::kInt16 &&
+          (guest_index_endian != xenos::Endian::kNone &&
+           guest_index_endian != xenos::Endian::k8in16)) {
+        XELOGW(
+            "Primitive processor: 32-bit endian swap mode {} is used for "
+            "16-bit indices. This shouldn't normally be happening, but report "
+            "the game to Xenia developers for investigation of the intended "
+            "behavior (ignore or actually swap across adjacent indices)! "
+            "Currently disabling the swap for 16-and-32 and replacing 8-in-32 "
+            "with 8-in-16.",
+            uint32_t(guest_index_endian));
+        guest_index_endian = guest_index_endian == xenos::Endian::k8in32
+                                 ? xenos::Endian::k8in16
+                                 : xenos::Endian::kNone;
+      }
+      cacheable.host_shader_index_endian = guest_index_endian;
+      // Get the index buffer memory range.
+      uint32_t index_size_log2 =
+          guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2;
+      // The base should already be aligned, but aligning here too for safety.
+      guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 &
+                         ~uint32_t((1 << index_size_log2) - 1);
+      guest_index_buffer_needed_bytes = guest_draw_vertex_count
+                                        << index_size_log2;
+      if (guest_index_base > SharedMemory::kBufferSize ||
+          SharedMemory::kBufferSize - guest_index_base <
+              guest_index_buffer_needed_bytes) {
+        XELOGE(
+            "Primitive processor: Index buffer at 0x{:08X}, 0x{:X} bytes "
+            "required, is out of the physical memory bounds",
+            guest_index_base, guest_index_buffer_needed_bytes);
+        assert_always();
+        return false;
+      }
+    }
+  } else if (vgt_draw_initiator.source_select ==
+             xenos::SourceSelect::kAutoIndex) {
     // Auto-indexed - use a remapping index buffer if needed to change the
     // primitive type.
     if (tessellation_enabled &&
@@ -376,9 +526,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
       assert_always();
       return false;
     }
-    guest_index_base = 0;
     cacheable.host_index_format = xenos::IndexFormat::kInt16;
-    cacheable.host_index_endian = xenos::Endian::kNone;
+    cacheable.host_shader_index_endian = xenos::Endian::kNone;
     cacheable.host_primitive_reset_enabled = false;
     cacheable.index_buffer_type = ProcessedIndexBufferType::kNone;
     if (host_primitive_type != guest_primitive_type) {
@@ -388,7 +537,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
                       xenos::PrimitiveType::kTriangleList);
           cacheable.host_draw_vertex_count =
               GetTriangleFanListIndexCount(cacheable.host_draw_vertex_count);
-          cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin;
+          cacheable.index_buffer_type =
+              ProcessedIndexBufferType::kHostBuiltinForAuto;
           assert_true(builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX);
           cacheable.host_index_buffer_handle =
               builtin_ib_offset_triangle_fans_to_lists_;
@@ -409,7 +559,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
                       xenos::PrimitiveType::kTriangleList);
           cacheable.host_draw_vertex_count = GetQuadListTriangleListIndexCount(
               cacheable.host_draw_vertex_count);
-          cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin;
+          cacheable.index_buffer_type =
+              ProcessedIndexBufferType::kHostBuiltinForAuto;
           assert_true(builtin_ib_offset_quad_lists_to_triangle_lists_ !=
                       SIZE_MAX);
           cacheable.host_index_buffer_handle =
@@ -503,8 +654,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
     // The base should already be aligned, but aligning here too for safety.
     guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 &
                        ~uint32_t((1 << index_size_log2) - 1);
-    uint32_t guest_index_buffer_needed_bytes = guest_draw_vertex_count
-                                               << index_size_log2;
+    guest_index_buffer_needed_bytes = guest_draw_vertex_count
+                                      << index_size_log2;
     if (guest_index_base > SharedMemory::kBufferSize ||
         SharedMemory::kBufferSize - guest_index_base <
             guest_index_buffer_needed_bytes) {
@@ -517,7 +668,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
     }
 
     cacheable.host_index_format = guest_index_format;
-    cacheable.host_index_endian = guest_index_endian;
+    cacheable.host_shader_index_endian = guest_index_endian;
     uint32_t guest_index_mask_guest_endian =
         guest_index_format == xenos::IndexFormat::kInt16
             ? UINT16_MAX
@@ -666,7 +817,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
                 assert_unhandled_case(guest_index_endian);
                 return false;
             }
-            cacheable.host_index_endian = xenos::Endian::kNone;
+            cacheable.host_shader_index_endian = xenos::Endian::kNone;
           }
         }
         cache_transaction.SetNewResult(cacheable);
@@ -677,7 +828,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
       // endian-swap, or even to safely drop the upper 8 bits if no swap is even
       // needed) indirectly.
       cacheable.host_draw_vertex_count = guest_draw_vertex_count;
-      cacheable.index_buffer_type = ProcessedIndexBufferType::kGuest;
+      cacheable.index_buffer_type = ProcessedIndexBufferType::kGuestDMA;
       cacheable.host_primitive_reset_enabled = guest_primitive_reset_enabled;
       if (guest_primitive_reset_enabled) {
         if (guest_index_format == xenos::IndexFormat::kInt16) {
@@ -742,8 +893,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
         } else {
           // Low 24 bits of the guest index are compared to the primitive reset
           // index. If the backend doesn't support full 32-bit indices, for
-          // ProcessedIndexBufferType::kGuest, the host needs to read the buffer
-          // indirectly in the vertex shaders and swap, and for
+          // ProcessedIndexBufferType::kGuestDMA, the host needs to read the
+          // buffer indirectly in the vertex shaders and swap, and for
           // ProcessedIndexBufferType::kHostConverted (if primitive reset is
           // actually used, thus exactly 0xFFFFFFFF must be sent to the host for
           // it in a true index buffer), no indirection is done, but
@@ -800,26 +951,31 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
                 assert_unhandled_case(guest_index_endian);
                 return false;
               }
-              cacheable.host_index_endian = full_32bit_vertex_indices_used_
-                                                ? guest_index_endian
-                                                : xenos::Endian::kNone;
+              cacheable.host_shader_index_endian =
+                  full_32bit_vertex_indices_used_ ? guest_index_endian
+                                                  : xenos::Endian::kNone;
             }
             cache_transaction.SetNewResult(cacheable);
           }
         }
       }
-      if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuest) {
-        // Request the index buffer memory.
-        // TODO(Triang3l): Shared memory request cache.
-        if (!shared_memory_.RequestRange(guest_index_base,
-                                         guest_index_buffer_needed_bytes)) {
-          XELOGE(
-              "PrimitiveProcessor: Failed to request index buffer 0x{:08X}, "
-              "0x{:X} bytes needed, in the shared memory",
-              guest_index_base, guest_index_buffer_needed_bytes);
-          return false;
-        }
-      }
+    }
+  }
+
+  // Request the indices in the shared memory if they need to be accessed from
+  // there on the GPU.
+  if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuestDMA ||
+      cacheable.index_buffer_type ==
+          ProcessedIndexBufferType::kHostBuiltinForDMA) {
+    // Request the index buffer memory.
+    // TODO(Triang3l): Shared memory request cache.
+    if (!shared_memory_.RequestRange(guest_index_base,
+                                     guest_index_buffer_needed_bytes)) {
+      XELOGE(
+          "PrimitiveProcessor: Failed to request index buffer 0x{:08X}, 0x{:X} "
+          "bytes needed, in the shared memory",
+          guest_index_base, guest_index_buffer_needed_bytes);
+      return false;
     }
   }
 
@@ -832,7 +988,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
   result_out.index_buffer_type = cacheable.index_buffer_type;
   result_out.guest_index_base = guest_index_base;
   result_out.host_index_format = cacheable.host_index_format;
-  result_out.host_index_endian = cacheable.host_index_endian;
+  result_out.host_shader_index_endian = cacheable.host_shader_index_endian;
   result_out.host_primitive_reset_enabled =
       cacheable.host_primitive_reset_enabled;
   result_out.host_index_buffer_handle = cacheable.host_index_buffer_handle;
diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h
index cfbec0ae9..6a77a3d0f 100644
--- a/src/xenia/gpu/primitive_processor.h
+++ b/src/xenia/gpu/primitive_processor.h
@@ -10,6 +10,7 @@
 #ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_
 #define XENIA_GPU_PRIMITIVE_PROCESSOR_H_
 
+#include <algorithm>
 #include <climits>
 #include <cstddef>
 #include <cstdint>
@@ -110,13 +111,16 @@ class PrimitiveProcessor {
     // For 32-bit, indirection is needed if the host only supports 24-bit
     // indices (even for non-endian-swapped, as the GPU should be ignoring the
     // upper 8 bits completely, rather than exhibiting undefined behavior.
-    kGuest,
+    kGuestDMA,
     // Converted and stored in the primitive converter for the current draw
     // command. For 32-bit indices, if the host doesn't support all 32 bits,
     // this kind of an index buffer will always be pre-masked and pre-swapped.
     kHostConverted,
     // Auto-indexed on the guest, but with an adapter index buffer on the host.
-    kHostBuiltin,
+    kHostBuiltinForAuto,
+    // Adapter index buffer on the host for indirect loading of indices via DMA
+    // (from the shared memory).
+    kHostBuiltinForDMA,
   };
 
   struct ProcessingResult {
@@ -136,13 +140,14 @@ class PrimitiveProcessor {
     ProcessedIndexBufferType index_buffer_type;
     uint32_t guest_index_base;
     xenos::IndexFormat host_index_format;
-    xenos::Endian host_index_endian;
+    xenos::Endian host_shader_index_endian;
     // The reset index, if enabled, is always 0xFFFF for host_index_format
     // kInt16 and 0xFFFFFFFF for kInt32. Never enabled for "list" primitive
     // types, thus safe for direct usage on Vulkan.
     bool host_primitive_reset_enabled;
     // Backend-specific handle for the index buffer valid for the current draw,
-    // only valid for index_buffer_type kHostConverted and kHostBuiltin.
+    // only valid for index_buffer_type kHostConverted, kHostBuiltinForAuto and
+    // kHostBuiltinForDMA.
     size_t host_index_buffer_handle;
     bool IsTessellated() const {
       return Shader::IsHostVertexShaderTypeDomain(host_vertex_shader_type);
@@ -165,6 +170,12 @@ class PrimitiveProcessor {
   bool IsConvertingQuadListsToTriangleLists() const {
     return convert_quad_lists_to_triangle_lists_;
   }
+  bool IsExpandingPointSpritesInVS() const {
+    return expand_point_sprites_in_vs_;
+  }
+  bool IsExpandingRectangleListsInVS() const {
+    return expand_rectangle_lists_in_vs_;
+  }
 
   // Submission must be open to call (may request the index buffer in the shared
   // memory).
@@ -217,8 +228,8 @@ class PrimitiveProcessor {
   //       if indirection may be needed.
   //     - When full 32-bit indices are not supported, the host must be using
   //       auto-indexed draws for 32-bit indices of ProcessedIndexBufferType
-  //       kGuest, while fetching the index data manually from the shared memory
-  //       buffer and endian-swapping it.
+  //       kGuestDMA, while fetching the index data manually from the shared
+  //       memory buffer and endian-swapping it.
   //     - Indirection, however, precludes primitive reset usage - so if
   //       primitive reset is needed, the primitive processor will pre-swap and
   //       pre-mask the index buffer so there are only host-endian 0x00###### or
@@ -235,19 +246,26 @@ class PrimitiveProcessor {
   //     those guest primitive types directly or through geometry shader
   //     emulation. Debug overriding will be resolved in the common code if
   //     needed.
+  // - point_sprites_supported_without_vs_expansion,
+  //   rectangle_lists_supported_without_vs_expansion:
+  //   - Pass true or false depending on whether the host actually supports
+  //     those guest primitive types directly or through geometry shader
+  //     emulation. Overrides do not apply to these as hosts are not required to
+  //     support the fallback paths since they require different vertex shader
+  //     structure (for the fallback HostVertexShaderTypes).
   bool InitializeCommon(bool full_32bit_vertex_indices_supported,
                         bool triangle_fans_supported, bool line_loops_supported,
-                        bool quad_lists_supported);
+                        bool quad_lists_supported,
+                        bool point_sprites_supported_without_vs_expansion,
+                        bool rectangle_lists_supported_without_vs_expansion);
   // If any primitive type conversion is needed for auto-indexed draws, called
   // from InitializeCommon (thus only once in the primitive processor's
   // lifetime) to set up the backend's index buffer containing indices for
-  // primitive type remapping. The backend must allocate a `sizeof(uint16_t) *
-  // index_count` buffer and call fill_callback for its mapping if creation is
-  // successful. 16-bit indices are enough even if the backend has primitive
-  // reset enabled all the time (Metal) as auto-indexed draws are limited to
-  // UINT16_MAX vertices, not UINT16_MAX + 1.
-  virtual bool InitializeBuiltin16BitIndexBuffer(
-      uint32_t index_count, std::function<void(uint16_t*)> fill_callback) = 0;
+  // primitive type remapping. The backend must allocate a 4-byte-aligned buffer
+  // of `size_bytes` bytes and call fill_callback for its mapping if creation
+  // has been successful.
+  virtual bool InitializeBuiltinIndexBuffer(
+      size_t size_bytes, std::function<void(void*)> fill_callback) = 0;
   // Call last in implementation-specific shutdown, also callable from the
   // destructor.
   void ShutdownCommon();
@@ -509,6 +527,12 @@ class PrimitiveProcessor {
     }
   };
 
+  static constexpr uint32_t GetTwoTriangleStripIndexCount(
+      uint32_t strip_count) {
+    // 4 indices per strip, plus one restart index between adjacent strips.
+    return 4 * strip_count + (std::max(strip_count, UINT32_C(1)) - 1);
+  }
+
   // Triangle fan test cases:
   // - 4D5307E6 - main menu - game logo, developer logo, backgrounds of the menu
   //   item list (the whole menu and individual items) - no index buffer.
@@ -675,8 +699,11 @@ class PrimitiveProcessor {
   bool convert_triangle_fans_to_lists_ = false;
   bool convert_line_loops_to_strips_ = false;
   bool convert_quad_lists_to_triangle_lists_ = false;
+  bool expand_point_sprites_in_vs_ = false;
+  bool expand_rectangle_lists_in_vs_ = false;
 
   // Byte offsets used, for simplicity, directly as handles.
+  size_t builtin_ib_offset_two_triangle_strips_ = SIZE_MAX;
   size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
   size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
 
@@ -745,7 +772,7 @@ class PrimitiveProcessor {
     uint32_t host_draw_vertex_count;
     ProcessedIndexBufferType index_buffer_type;
     xenos::IndexFormat host_index_format;
-    xenos::Endian host_index_endian;
+    xenos::Endian host_shader_index_endian;
     bool host_primitive_reset_enabled;
     size_t host_index_buffer_handle;
   };
diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc
index 199d0f99c..bb89e0d41 100644
--- a/src/xenia/gpu/spirv_shader_translator.cc
+++ b/src/xenia/gpu/spirv_shader_translator.cc
@@ -111,6 +111,7 @@ void SpirvShaderTranslator::Reset() {
   input_front_facing_ = spv::NoResult;
   std::fill(input_output_interpolators_.begin(),
             input_output_interpolators_.end(), spv::NoResult);
+  output_point_coordinates_ = spv::NoResult;
   output_point_size_ = spv::NoResult;
 
   sampler_bindings_.clear();
@@ -1097,18 +1098,33 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() {
 
   Modification shader_modification = GetSpirvShaderModification();
 
-  // Create the point size output. Not using gl_PointSize from gl_PerVertex not
-  // to rely on the shaderTessellationAndGeometryPointSize feature, and also
-  // because the value written to gl_PointSize must be greater than zero.
-  if (shader_modification.vertex.output_point_size) {
-    output_point_size_ =
-        builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput,
-                                 type_float_, "xe_out_point_size");
-    builder_->addDecoration(output_point_size_, spv::DecorationLocation,
-                            int(output_location));
-    builder_->addDecoration(output_point_size_, spv::DecorationInvariant);
-    main_interface_.push_back(output_point_size_);
-    ++output_location;
+  if (shader_modification.vertex.output_point_parameters) {
+    if (shader_modification.vertex.host_vertex_shader_type ==
+        Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
+      // Create the point coordinates output.
+      output_point_coordinates_ =
+          builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput,
+                                   type_float2_, "xe_out_point_coordinates");
+      builder_->addDecoration(output_point_coordinates_,
+                              spv::DecorationLocation, int(output_location));
+      builder_->addDecoration(output_point_coordinates_,
+                              spv::DecorationInvariant);
+      main_interface_.push_back(output_point_coordinates_);
+      ++output_location;
+    } else {
+      // Create the point size output. Not using gl_PointSize from gl_PerVertex
+      // not to rely on the shaderTessellationAndGeometryPointSize feature, and
+      // also because the value written to gl_PointSize must be greater than
+      // zero.
+      output_point_size_ =
+          builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput,
+                                   type_float_, "xe_out_point_size");
+      builder_->addDecoration(output_point_size_, spv::DecorationLocation,
+                              int(output_location));
+      builder_->addDecoration(output_point_size_, spv::DecorationInvariant);
+      main_interface_.push_back(output_point_size_);
+      ++output_location;
+    }
   }
 
   // Create the gl_PerVertex output for used system outputs.
@@ -1172,24 +1188,33 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
     }
   }
 
+  Modification shader_modification = GetSpirvShaderModification();
+
+  // TODO(Triang3l): For HostVertexShaderType::kRectangleListAsTriangleStrip,
+  // start the vertex loop, and load the index there.
+
   // Load the vertex index or the tessellation parameters.
   if (register_count()) {
     // TODO(Triang3l): Barycentric coordinates and patch index.
     if (IsSpirvVertexShader()) {
-      // TODO(Triang3l): Close line loop primitive.
-      // Load the unswapped index as uint for swapping, or for indirect loading
-      // if needed.
       spv::Id vertex_index = builder_->createUnaryOp(
           spv::OpBitcast, type_uint_,
           builder_->createLoad(input_vertex_index_, spv::NoPrecision));
-      if (!features_.full_draw_index_uint32) {
-        // Check if the full 32-bit index needs to be loaded indirectly.
+      if (shader_modification.vertex.host_vertex_shader_type ==
+          Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
+        // Load the point index, autogenerated or indirectly from the index
+        // buffer.
+        // Extract the primitive index from the two-triangle strip vertex index.
+        spv::Id const_uint_2 = builder_->makeUintConstant(2);
+        vertex_index = builder_->createBinOp(
+            spv::OpShiftRightLogical, type_uint_, vertex_index, const_uint_2);
+        // Check if the index needs to be loaded from the index buffer.
         spv::Id load_vertex_index = builder_->createBinOp(
             spv::OpINotEqual, type_bool_,
             builder_->createBinOp(
                 spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
-                builder_->makeUintConstant(
-                    static_cast<unsigned int>(kSysFlag_VertexIndexLoad))),
+                builder_->makeUintConstant(static_cast<unsigned int>(
+                    kSysFlag_ComputeOrPrimitiveVertexIndexLoad))),
             const_uint_0_);
         spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint();
         spv::Block& block_load_vertex_index_start = builder_->makeNewBlock();
@@ -1200,25 +1225,61 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
                                           &block_load_vertex_index_start,
                                           &block_load_vertex_index_merge);
         builder_->setBuildPoint(&block_load_vertex_index_start);
-        // Load the 32-bit index.
-        // TODO(Triang3l): Bounds checking.
+        // Check if the index is 32-bit.
+        spv::Id vertex_index_is_32bit = builder_->createBinOp(
+            spv::OpINotEqual, type_bool_,
+            builder_->createBinOp(
+                spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
+                builder_->makeUintConstant(static_cast<unsigned int>(
+                    kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit))),
+            const_uint_0_);
+        // Calculate the vertex index address in the shared memory.
         id_vector_temp_.clear();
         id_vector_temp_.push_back(
             builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress));
+        spv::Id vertex_index_address = builder_->createBinOp(
+            spv::OpIAdd, type_uint_,
+            builder_->createLoad(
+                builder_->createAccessChain(spv::StorageClassUniform,
+                                            uniform_system_constants_,
+                                            id_vector_temp_),
+                spv::NoPrecision),
+            builder_->createBinOp(
+                spv::OpShiftLeftLogical, type_uint_, vertex_index,
+                builder_->createTriOp(spv::OpSelect, type_uint_,
+                                      vertex_index_is_32bit, const_uint_2,
+                                      builder_->makeUintConstant(1))));
+        // Load the 32 bits containing the whole vertex index or two 16-bit
+        // vertex indices.
+        // TODO(Triang3l): Bounds checking.
         spv::Id loaded_vertex_index =
             LoadUint32FromSharedMemory(builder_->createUnaryOp(
                 spv::OpBitcast, type_int_,
+                builder_->createBinOp(spv::OpShiftRightLogical, type_uint_,
+                                      vertex_index_address, const_uint_2)));
+        // Extract the 16-bit index from the loaded 32 bits if needed.
+        loaded_vertex_index = builder_->createTriOp(
+            spv::OpSelect, type_uint_, vertex_index_is_32bit,
+            loaded_vertex_index,
+            builder_->createTriOp(
+                spv::OpBitFieldUExtract, type_uint_, loaded_vertex_index,
                 builder_->createBinOp(
-                    spv::OpIAdd, type_uint_,
-                    builder_->createBinOp(
-                        spv::OpShiftRightLogical, type_uint_,
-                        builder_->createLoad(
-                            builder_->createAccessChain(
-                                spv::StorageClassUniform,
-                                uniform_system_constants_, id_vector_temp_),
-                            spv::NoPrecision),
-                        builder_->makeUintConstant(2)),
-                    vertex_index)));
+                    spv::OpShiftLeftLogical, type_uint_,
+                    builder_->createBinOp(spv::OpBitwiseAnd, type_uint_,
+                                          vertex_index_address, const_uint_2),
+                    builder_->makeUintConstant(4 - 1)),
+                builder_->makeUintConstant(16)));
+        // Endian-swap the loaded index.
+        id_vector_temp_.clear();
+        id_vector_temp_.push_back(
+            builder_->makeIntConstant(kSystemConstantVertexIndexEndian));
+        loaded_vertex_index = EndianSwap32Uint(
+            loaded_vertex_index,
+            builder_->createLoad(
+                builder_->createAccessChain(spv::StorageClassUniform,
+                                            uniform_system_constants_,
+                                            id_vector_temp_),
+                spv::NoPrecision));
         // Get the actual build point for phi.
         spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint();
         builder_->createBranch(&block_load_vertex_index_merge);
@@ -1238,19 +1299,81 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
           builder_->getBuildPoint()->addInstruction(
               std::move(loaded_vertex_index_phi_op));
         }
+      } else {
+        // TODO(Triang3l): Close line loop primitive.
+        // Load the unswapped index as uint for swapping, or for indirect
+        // loading if needed.
+        if (!features_.full_draw_index_uint32) {
+          // Check if the full 32-bit index needs to be loaded indirectly.
+          spv::Id load_vertex_index = builder_->createBinOp(
+              spv::OpINotEqual, type_bool_,
+              builder_->createBinOp(
+                  spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
+                  builder_->makeUintConstant(
+                      static_cast<unsigned int>(kSysFlag_VertexIndexLoad))),
+              const_uint_0_);
+          spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint();
+          spv::Block& block_load_vertex_index_start = builder_->makeNewBlock();
+          spv::Block& block_load_vertex_index_merge = builder_->makeNewBlock();
+          SpirvCreateSelectionMerge(block_load_vertex_index_merge.getId(),
+                                    spv::SelectionControlDontFlattenMask);
+          builder_->createConditionalBranch(load_vertex_index,
+                                            &block_load_vertex_index_start,
+                                            &block_load_vertex_index_merge);
+          builder_->setBuildPoint(&block_load_vertex_index_start);
+          // Load the 32-bit index.
+          // TODO(Triang3l): Bounds checking.
+          id_vector_temp_.clear();
+          id_vector_temp_.push_back(
+              builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress));
+          spv::Id loaded_vertex_index =
+              LoadUint32FromSharedMemory(builder_->createUnaryOp(
+                  spv::OpBitcast, type_int_,
+                  builder_->createBinOp(
+                      spv::OpIAdd, type_uint_,
+                      builder_->createBinOp(
+                          spv::OpShiftRightLogical, type_uint_,
+                          builder_->createLoad(
+                              builder_->createAccessChain(
+                                  spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+                              spv::NoPrecision),
+                          builder_->makeUintConstant(2)),
+                      vertex_index)));
+          // Get the actual build point for phi.
+          spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint();
+          builder_->createBranch(&block_load_vertex_index_merge);
+          // Select between the loaded index and the original index from Vulkan.
+          builder_->setBuildPoint(&block_load_vertex_index_merge);
+          {
+            std::unique_ptr<spv::Instruction> loaded_vertex_index_phi_op =
+                std::make_unique<spv::Instruction>(builder_->getUniqueId(),
+                                                   type_uint_, spv::OpPhi);
+            loaded_vertex_index_phi_op->addIdOperand(loaded_vertex_index);
+            loaded_vertex_index_phi_op->addIdOperand(
+                block_load_vertex_index_end.getId());
+            loaded_vertex_index_phi_op->addIdOperand(vertex_index);
+            loaded_vertex_index_phi_op->addIdOperand(
+                block_load_vertex_index_pre.getId());
+            vertex_index = loaded_vertex_index_phi_op->getResultId();
+            builder_->getBuildPoint()->addInstruction(
+                std::move(loaded_vertex_index_phi_op));
+          }
+        }
+        // Endian-swap the index.
+        id_vector_temp_.clear();
+        id_vector_temp_.push_back(
+            builder_->makeIntConstant(kSystemConstantVertexIndexEndian));
+        vertex_index = EndianSwap32Uint(
+            vertex_index, builder_->createLoad(
+                              builder_->createAccessChain(
+                                  spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+                              spv::NoPrecision));
       }
-      // Endian-swap the index and convert to int.
-      id_vector_temp_.clear();
-      id_vector_temp_.push_back(
-          builder_->makeIntConstant(kSystemConstantVertexIndexEndian));
-      spv::Id vertex_index_endian =
-          builder_->createLoad(builder_->createAccessChain(
-                                   spv::StorageClassUniform,
-                                   uniform_system_constants_, id_vector_temp_),
-                               spv::NoPrecision);
-      vertex_index = builder_->createUnaryOp(
-          spv::OpBitcast, type_int_,
-          EndianSwap32Uint(vertex_index, vertex_index_endian));
+      // Convert the index to a signed integer.
+      vertex_index =
+          builder_->createUnaryOp(spv::OpBitcast, type_int_, vertex_index);
       // Add the base to the index.
       id_vector_temp_.clear();
       id_vector_temp_.push_back(
@@ -1301,61 +1424,66 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
       builder_->createTriOp(spv::OpSelect, type_float_, is_w_not_reciprocal,
                             position_w, guest_position_w_inv);
 
-  // Check if the shader returns XY/W rather than XY, and if it does, revert
-  // that.
-  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
-  // affine interpolation.
-  uint_vector_temp_.clear();
-  uint_vector_temp_.reserve(2);
-  uint_vector_temp_.push_back(0);
-  uint_vector_temp_.push_back(1);
-  spv::Id position_xy = builder_->createRvalueSwizzle(
-      spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_);
-  spv::Id is_xy_divided_by_w = builder_->createBinOp(
-      spv::OpINotEqual, type_bool_,
-      builder_->createBinOp(
-          spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
-          builder_->makeUintConstant(
-              static_cast<unsigned int>(kSysFlag_XYDividedByW))),
-      const_uint_0_);
-  spv::Id guest_position_xy_mul_w = builder_->createBinOp(
-      spv::OpVectorTimesScalar, type_float2_, position_xy, position_w);
-  builder_->addDecoration(guest_position_xy_mul_w,
-                          spv::DecorationNoContraction);
-  position_xy =
-      builder_->createTriOp(spv::OpSelect, type_float2_, is_xy_divided_by_w,
-                            guest_position_xy_mul_w, position_xy);
-
-  // Check if the shader returns Z/W rather than Z, and if it does, revert that.
-  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
-  // affine interpolation.
-  spv::Id position_z =
-      builder_->createCompositeExtract(guest_position, type_float_, 2);
-  spv::Id is_z_divided_by_w = builder_->createBinOp(
-      spv::OpINotEqual, type_bool_,
-      builder_->createBinOp(
-          spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
-          builder_->makeUintConstant(
-              static_cast<unsigned int>(kSysFlag_ZDividedByW))),
-      const_uint_0_);
-  spv::Id guest_position_z_mul_w =
-      builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w);
-  builder_->addDecoration(guest_position_z_mul_w, spv::DecorationNoContraction);
-  position_z =
-      builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w,
-                            guest_position_z_mul_w, position_z);
-
-  // Build XYZ of the position with W format handled.
   spv::Id position_xyz;
+
+  // Open a scope since position_xy and position_z won't be synchronized anymore
+  // after position_xyz is built and modified later.
   {
-    std::unique_ptr<spv::Instruction> composite_construct_op =
-        std::make_unique<spv::Instruction>(
-            builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct);
-    composite_construct_op->addIdOperand(position_xy);
-    composite_construct_op->addIdOperand(position_z);
-    position_xyz = composite_construct_op->getResultId();
-    builder_->getBuildPoint()->addInstruction(
-        std::move(composite_construct_op));
+    // Check if the shader returns XY/W rather than XY, and if it does, revert
+    // that.
+    uint_vector_temp_.clear();
+    uint_vector_temp_.reserve(2);
+    uint_vector_temp_.push_back(0);
+    uint_vector_temp_.push_back(1);
+    spv::Id position_xy = builder_->createRvalueSwizzle(
+        spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_);
+    spv::Id is_xy_divided_by_w = builder_->createBinOp(
+        spv::OpINotEqual, type_bool_,
+        builder_->createBinOp(
+            spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
+            builder_->makeUintConstant(
+                static_cast<unsigned int>(kSysFlag_XYDividedByW))),
+        const_uint_0_);
+    spv::Id guest_position_xy_mul_w = builder_->createBinOp(
+        spv::OpVectorTimesScalar, type_float2_, position_xy, position_w);
+    builder_->addDecoration(guest_position_xy_mul_w,
+                            spv::DecorationNoContraction);
+    position_xy = builder_->createTriOp(
+        spv::OpSelect, type_float2_,
+        builder_->smearScalar(spv::NoPrecision, is_xy_divided_by_w,
+                              type_bool2_),
+        guest_position_xy_mul_w, position_xy);
+
+    // Check if the shader returns Z/W rather than Z, and if it does, revert
+    // that.
+    spv::Id position_z =
+        builder_->createCompositeExtract(guest_position, type_float_, 2);
+    spv::Id is_z_divided_by_w = builder_->createBinOp(
+        spv::OpINotEqual, type_bool_,
+        builder_->createBinOp(
+            spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
+            builder_->makeUintConstant(
+                static_cast<unsigned int>(kSysFlag_ZDividedByW))),
+        const_uint_0_);
+    spv::Id guest_position_z_mul_w =
+        builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w);
+    builder_->addDecoration(guest_position_z_mul_w,
+                            spv::DecorationNoContraction);
+    position_z =
+        builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w,
+                              guest_position_z_mul_w, position_z);
+
+    // Build XYZ of the position with W format handled.
+    {
+      std::unique_ptr<spv::Instruction> composite_construct_op =
+          std::make_unique<spv::Instruction>(
+              builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct);
+      composite_construct_op->addIdOperand(position_xy);
+      composite_construct_op->addIdOperand(position_z);
+      position_xyz = composite_construct_op->getResultId();
+      builder_->getBuildPoint()->addInstruction(
+          std::move(composite_construct_op));
+    }
   }
 
   // Apply the NDC scale and offset for guest to host viewport transformation.
@@ -1382,20 +1510,6 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
                                        ndc_offset_mul_w);
   builder_->addDecoration(position_xyz, spv::DecorationNoContraction);
 
-  // Store the position converted to the host.
-  spv::Id position;
-  {
-    std::unique_ptr<spv::Instruction> composite_construct_op =
-        std::make_unique<spv::Instruction>(
-            builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
-    composite_construct_op->addIdOperand(position_xyz);
-    composite_construct_op->addIdOperand(position_w);
-    position = composite_construct_op->getResultId();
-    builder_->getBuildPoint()->addInstruction(
-        std::move(composite_construct_op));
-  }
-  builder_->createStore(position, position_ptr);
-
   // Write the point size.
   if (output_point_size_ != spv::NoResult) {
     spv::Id point_size;
@@ -1415,6 +1529,154 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
     }
     builder_->createStore(point_size, output_point_size_);
   }
+
+  Modification shader_modification = GetSpirvShaderModification();
+
+  // Expand the point sprite.
+  if (shader_modification.vertex.host_vertex_shader_type ==
+      Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
+    // Top-left, bottom-left, top-right, bottom-right order (chosen arbitrarily,
+    // simply based on counterclockwise meaning front with
+    // frontFace = VkFrontFace(0), but faceness is ignored for non-polygon
+    // primitive types).
+    id_vector_temp_.clear();
+    id_vector_temp_.reserve(2);
+    id_vector_temp_.push_back(builder_->makeUintConstant(0b10));
+    id_vector_temp_.push_back(builder_->makeUintConstant(0b01));
+    spv::Id point_vertex_positive = builder_->createBinOp(
+        spv::OpINotEqual, type_bool2_,
+        builder_->createBinOp(
+            spv::OpBitwiseAnd, type_uint2_,
+            builder_->smearScalar(spv::NoPrecision,
+                                  builder_->createUnaryOp(
+                                      spv::OpBitcast, type_uint_,
+                                      builder_->createLoad(input_vertex_index_,
+                                                           spv::NoPrecision)),
+                                  type_uint2_),
+            builder_->createCompositeConstruct(type_uint2_, id_vector_temp_)),
+        SpirvSmearScalarResultOrConstant(const_uint_0_, type_uint2_));
+
+    // Load the point diameter in guest pixels, with the override from the
+    // vertex shader if provided.
+    id_vector_temp_.clear();
+    id_vector_temp_.push_back(
+        builder_->makeIntConstant(kSystemConstantPointConstantDiameter));
+    spv::Id point_guest_diameter = builder_->createLoad(
+        builder_->createAccessChain(spv::StorageClassUniform,
+                                    uniform_system_constants_, id_vector_temp_),
+        spv::NoPrecision);
+    if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b001) {
+      assert_true(var_main_point_size_edge_flag_kill_vertex_ != spv::NoResult);
+      id_vector_temp_.clear();
+      id_vector_temp_.push_back(const_int_0_);
+      spv::Id point_vertex_diameter = builder_->createLoad(
+          builder_->createAccessChain(
+              spv::StorageClassFunction,
+              var_main_point_size_edge_flag_kill_vertex_, id_vector_temp_),
+          spv::NoPrecision);
+      // The vertex shader's header writes -1.0 to point_size by default, so any
+      // non-negative value means that it was overwritten by the translated
+      // vertex shader, and needs to be used instead of the constant size. The
+      // per-vertex diameter has already been clamped earlier in translation
+      // (combined with making it non-negative).
+      point_guest_diameter = builder_->createTriOp(
+          spv::OpSelect, type_float2_,
+          builder_->smearScalar(
+              spv::NoPrecision,
+              builder_->createBinOp(spv::OpFOrdGreaterThanEqual, type_bool_,
+                                    point_vertex_diameter, const_float_0_),
+              type_bool2_),
+          builder_->smearScalar(spv::NoPrecision, point_vertex_diameter,
+                                type_float2_),
+          point_guest_diameter);
+    }
+    // Transform the diameter in the guest screen coordinates to radius in the
+    // normalized device coordinates.
+    id_vector_temp_.clear();
+    id_vector_temp_.push_back(builder_->makeIntConstant(
+        kSystemConstantPointScreenDiameterToNdcRadius));
+    spv::Id point_radius = builder_->createBinOp(
+        spv::OpFMul, type_float2_, point_guest_diameter,
+        builder_->createLoad(builder_->createAccessChain(
+                                 spv::StorageClassUniform,
+                                 uniform_system_constants_, id_vector_temp_),
+                             spv::NoPrecision));
+    builder_->addDecoration(point_radius, spv::DecorationNoContraction);
+    // Transform the radius from the normalized device coordinates to the clip
+    // space.
+    point_radius = builder_->createBinOp(spv::OpVectorTimesScalar, type_float2_,
+                                         point_radius, position_w);
+    builder_->addDecoration(point_radius, spv::DecorationNoContraction);
+
+    // Apply the direction of expansion for the current host vertex.
+    spv::Id point_radius_negative =
+        builder_->createUnaryOp(spv::OpFNegate, type_float2_, point_radius);
+    builder_->addDecoration(point_radius_negative,
+                            spv::DecorationNoContraction);
+    // Expand the point sprite.
+    uint_vector_temp_.clear();
+    uint_vector_temp_.reserve(2);
+    uint_vector_temp_.push_back(0);
+    uint_vector_temp_.push_back(1);
+    spv::Id point_position_xy = builder_->createBinOp(
+        spv::OpFAdd, type_float2_,
+        builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_,
+                                      position_xyz, uint_vector_temp_),
+        builder_->createTriOp(spv::OpSelect, type_float2_,
+                              point_vertex_positive, point_radius,
+                              point_radius_negative));
+    builder_->addDecoration(point_position_xy, spv::DecorationNoContraction);
+    // Store the position.
+    spv::Id position;
+    {
+      // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()`
+      // assertion in createCompositeConstruct, OpCompositeConstruct can
+      // construct vectors not only from scalars, but also from other vectors.
+      std::unique_ptr<spv::Instruction> composite_construct_op =
+          std::make_unique<spv::Instruction>(
+              builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
+      composite_construct_op->addIdOperand(point_position_xy);
+      composite_construct_op->addIdOperand(
+          builder_->createCompositeExtract(position_xyz, type_float_, 2));
+      composite_construct_op->addIdOperand(position_w);
+      position = composite_construct_op->getResultId();
+      builder_->getBuildPoint()->addInstruction(
+          std::move(composite_construct_op));
+    }
+    builder_->createStore(position, position_ptr);
+
+    // Write the point coordinates.
+    if (output_point_coordinates_ != spv::NoResult) {
+      builder_->createStore(
+          builder_->createTriOp(spv::OpSelect, type_float2_,
+                                point_vertex_positive, const_float2_1_,
+                                const_float2_0_),
+          output_point_coordinates_);
+    }
+
+    // TODO(Triang3l): For points, handle ps_ucp_mode (take the guest clip space
+    // coordinates instead of the host ones, calculate the distances to the user
+    // clip planes, cull using the distance from the center for modes 0, 1 and
+    // 2, cull and clip per-vertex for modes 2 and 3) in clip and cull
+    // distances.
+  } else {
+    // Store the position converted to the host.
+    spv::Id position;
+    {
+      // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()`
+      // assertion in createCompositeConstruct, OpCompositeConstruct can
+      // construct vectors not only from scalars, but also from other vectors.
+      std::unique_ptr<spv::Instruction> composite_construct_op =
+          std::make_unique<spv::Instruction>(
+              builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
+      composite_construct_op->addIdOperand(position_xyz);
+      composite_construct_op->addIdOperand(position_w);
+      position = composite_construct_op->getResultId();
+      builder_->getBuildPoint()->addInstruction(
+          std::move(composite_construct_op));
+    }
+    builder_->createStore(position, position_ptr);
+  }
 }
 
 void SpirvShaderTranslator::StartFragmentShaderBeforeMain() {
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h
index 733bbf2ff..3bcd342a3 100644
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@@ -50,7 +50,11 @@ class SpirvShaderTranslator : public ShaderTranslator {
       // Interpolators written by the vertex shader and needed by the pixel
       // shader.
       uint32_t interpolator_mask : xenos::kMaxInterpolators;
-      uint32_t output_point_size : 1;
+      // For HostVertexShaderType kPointListAsTriangleStrip, whether to output
+      // the point coordinates.
+      // For other HostVertexShaderTypes (though truly reachable only for
+      // kVertex), whether to output the point size.
+      uint32_t output_point_parameters : 1;
       // Dynamically indexable register count from SQ_PROGRAM_CNTL.
       uint32_t dynamic_addressable_register_count : 8;
       // Pipeline stage and input configuration.
@@ -655,6 +659,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
   // all).
   std::array<spv::Id, xenos::kMaxInterpolators> input_output_interpolators_;
 
+  // VS, only for HostVertexShaderType::kPointListAsTriangleStrip when needed
+  // for the PS - float2.
+  spv::Id output_point_coordinates_;
   // VS, only when needed - float.
   spv::Id output_point_size_;
 
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 80affe639..68a00cbe8 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -2171,7 +2171,9 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
     // TODO(Triang3l): Tessellation, geometry-type-specific vertex shader,
     // vertex shader as compute.
     if (primitive_processing_result.host_vertex_shader_type !=
-        Shader::HostVertexShaderType::kVertex) {
+            Shader::HostVertexShaderType::kVertex &&
+        primitive_processing_result.host_vertex_shader_type !=
+            Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
       return false;
     }
 
@@ -2179,7 +2181,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
     vertex_shader_modification =
         pipeline_cache_->GetCurrentVertexShaderModification(
             *vertex_shader, primitive_processing_result.host_vertex_shader_type,
-            interpolator_mask);
+            interpolator_mask, ps_param_gen_pos != UINT32_MAX);
     pixel_shader_modification =
         pixel_shader ? pipeline_cache_->GetCurrentPixelShaderModification(
                            *pixel_shader, interpolator_mask, ps_param_gen_pos)
@@ -2348,6 +2350,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
   }
 
   const ui::vulkan::VulkanProvider& provider = GetVulkanProvider();
+  const VkPhysicalDeviceFeatures& device_features = provider.device_features();
   const VkPhysicalDeviceLimits& device_limits =
       provider.device_properties().limits;
 
@@ -2382,11 +2385,23 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
   UpdateDynamicState(viewport_info, primitive_polygonal,
                      normalized_depth_control);
 
+  auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
+
+  // Whether to load the guest 32-bit (usually big-endian) vertex index
+  // indirectly in the vertex shader if full 32-bit indices are not supported by
+  // the host.
+  bool shader_32bit_index_dma =
+      !device_features.fullDrawIndexUint32 &&
+      primitive_processing_result.index_buffer_type ==
+          PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA &&
+      vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32 &&
+      primitive_processing_result.host_vertex_shader_type ==
+          Shader::HostVertexShaderType::kVertex;
+
   // Update system constants before uploading them.
-  bool vertex_shader_index_load;
   UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result,
-                             viewport_info, used_texture_mask,
-                             vertex_shader_index_load);
+                             shader_32bit_index_dma, viewport_info,
+                             used_texture_mask);
 
   // Update uniform buffers and descriptor sets after binding the pipeline with
   // the new layout.
@@ -2453,13 +2468,13 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
   // Draw.
   if (primitive_processing_result.index_buffer_type ==
           PrimitiveProcessor::ProcessedIndexBufferType::kNone ||
-      vertex_shader_index_load) {
+      shader_32bit_index_dma) {
     deferred_command_buffer_.CmdVkDraw(
         primitive_processing_result.host_draw_vertex_count, 1, 0, 0);
   } else {
     std::pair<VkBuffer, VkDeviceSize> index_buffer;
     switch (primitive_processing_result.index_buffer_type) {
-      case PrimitiveProcessor::ProcessedIndexBufferType::kGuest:
+      case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA:
         index_buffer.first = shared_memory_->buffer();
         index_buffer.second = primitive_processing_result.guest_index_base;
         break;
@@ -2467,7 +2482,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
         index_buffer = primitive_processor_->GetConvertedIndexBuffer(
             primitive_processing_result.host_index_buffer_handle);
         break;
-      case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin:
+      case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto:
+      case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA:
         index_buffer = primitive_processor_->GetBuiltinIndexBuffer(
             primitive_processing_result.host_index_buffer_handle);
         break;
@@ -3342,8 +3358,8 @@ void VulkanCommandProcessor::UpdateDynamicState(
 void VulkanCommandProcessor::UpdateSystemConstantValues(
     bool primitive_polygonal,
     const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
-    const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
-    bool& vertex_shader_index_load_out) {
+    bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info,
+    uint32_t used_texture_mask) {
 #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
   SCOPE_profile_cpu_f("gpu");
 #endif  // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
@@ -3367,51 +3383,17 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
   // Flags.
   uint32_t flags = 0;
   // Vertex index shader loading.
-  bool vertex_shader_index_load = false;
-  // Only for ProcessedIndexBufferType kGuest since kHostConverted indices may
-  // be not loaded into the GPU memory (only read on the CPU), though
-  // kHostConverted must never be used for point lists and rectangle lists
-  // without geometry shaders anyway. For regular 32-bit index fetching without
-  // fullDrawIndexUint32, kHostConverted indices are already byte-swapped and
-  // truncated to 24 bits, so indirect fetch is not needed.
+  if (shader_32bit_index_dma) {
+    flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad;
+  }
   if (primitive_processing_result.index_buffer_type ==
-      PrimitiveProcessor::ProcessedIndexBufferType::kGuest) {
-    switch (primitive_processing_result.host_vertex_shader_type) {
-      case Shader::HostVertexShaderType::kVertex: {
-        // For guest (usually big-endian) 32-bit indices when they're not
-        // supported by the device.
-        if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) {
-          const ui::vulkan::VulkanProvider& provider = GetVulkanProvider();
-          const VkPhysicalDeviceFeatures& device_features =
-              provider.device_features();
-          if (!device_features.fullDrawIndexUint32) {
-            vertex_shader_index_load = true;
-            flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad;
-          }
-        }
-      } break;
-      // kMemexportCompute never comes out of the PrimitiveProcessor, as
-      // memexport compute shaders are executed alongside their vertex
-      // counterparts, since they may still result in drawing.
-      case Shader::HostVertexShaderType::kPointListAsTriangleStrip:
-      case Shader::HostVertexShaderType::kRectangleListAsTriangleStrip: {
-        // Always loading the guest index buffer indirectly if it's used, as
-        // host indexing contains a part needed specifically for the host for
-        // the construction of the primitive - host vertices don't map 1:1 to
-        // guest ones.
-        vertex_shader_index_load = true;
-        flags |=
-            SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad;
-        if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) {
-          flags |= SpirvShaderTranslator ::
-              kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit;
-        }
-      } break;
-      default:
-        break;
+      PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA) {
+    flags |= SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad;
+    if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) {
+      flags |= SpirvShaderTranslator ::
+          kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit;
     }
   }
-  vertex_shader_index_load_out = vertex_shader_index_load;
   // W0 division control.
   // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
   // 8: VTX_XY_FMT = true: the incoming XY have already been multiplied by 1/W0.
@@ -3466,9 +3448,9 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
 
   // Index or tessellation edge factor buffer endianness.
   dirty |= system_constants_.vertex_index_endian !=
-           primitive_processing_result.host_index_endian;
+           primitive_processing_result.host_shader_index_endian;
   system_constants_.vertex_index_endian =
-      primitive_processing_result.host_index_endian;
+      primitive_processing_result.host_shader_index_endian;
 
   // Vertex index offset.
   dirty |= system_constants_.vertex_base_index != vgt_indx_offset;
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h
index f500e0718..7920981fb 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h
@@ -436,8 +436,8 @@ class VulkanCommandProcessor : public CommandProcessor {
   void UpdateSystemConstantValues(
       bool primitive_polygonal,
       const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
-      const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
-      bool& vertex_shader_index_load_out);
+      bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info,
+      uint32_t used_texture_mask);
   bool UpdateBindings(const VulkanShader* vertex_shader,
                       const VulkanShader* pixel_shader);
   // Allocates a descriptor set and fills one or two VkWriteDescriptorSet
diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc
index 7cf30e250..aff800c1a 100644
--- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc
+++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc
@@ -118,7 +118,7 @@ VulkanShader* VulkanPipelineCache::LoadShader(xenos::ShaderType shader_type,
 SpirvShaderTranslator::Modification
 VulkanPipelineCache::GetCurrentVertexShaderModification(
     const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type,
-    uint32_t interpolator_mask) const {
+    uint32_t interpolator_mask, bool ps_param_gen_used) const {
   assert_true(shader.type() == xenos::ShaderType::kVertex);
   assert_true(shader.is_ucode_analyzed());
   const auto& regs = register_file_;
@@ -133,10 +133,15 @@ VulkanPipelineCache::GetCurrentVertexShaderModification(
 
   modification.vertex.interpolator_mask = interpolator_mask;
 
-  modification.vertex.output_point_size =
-      uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) &&
-               regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
-                   xenos::PrimitiveType::kPointList);
+  if (host_vertex_shader_type ==
+      Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
+    modification.vertex.output_point_parameters = uint32_t(ps_param_gen_used);
+  } else {
+    modification.vertex.output_point_parameters =
+        uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) &&
+                 regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
+                     xenos::PrimitiveType::kPointList);
+  }
 
   return modification;
 }
@@ -828,6 +833,17 @@ bool VulkanPipelineCache::GetGeometryShaderKey(
   if (geometry_shader_type == PipelineGeometryShader::kNone) {
     return false;
   }
+  // For kPointListAsTriangleStrip, output_point_parameters has a different
+  // meaning (the coordinates, not the size). However, the AsTriangleStrip host
+  // vertex shader types are fallbacks needed specifically when geometry
+  // shaders are not supported, so they must never reach this function.
+  if (vertex_shader_modification.vertex.host_vertex_shader_type ==
+          Shader::HostVertexShaderType::kPointListAsTriangleStrip ||
+      vertex_shader_modification.vertex.host_vertex_shader_type ==
+          Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
+    assert_always();
+    return false;
+  }
   GeometryShaderKey key;
   key.type = geometry_shader_type;
   // TODO(Triang3l): Once all needed inputs and outputs are added, uncomment the
@@ -840,7 +856,8 @@ bool VulkanPipelineCache::GetGeometryShaderKey(
       /* vertex_shader_modification.vertex.user_clip_plane_cull */ 0;
   key.has_vertex_kill_and =
       /* vertex_shader_modification.vertex.vertex_kill_and */ 0;
-  key.has_point_size = vertex_shader_modification.vertex.output_point_size;
+  key.has_point_size =
+      vertex_shader_modification.vertex.output_point_parameters;
   key.has_point_coordinates = pixel_shader_modification.pixel.param_gen_point;
   key_out = key;
   return true;
diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h
index 6e0c73ab0..56346d1bc 100644
--- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h
+++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h
@@ -71,7 +71,7 @@ class VulkanPipelineCache {
   SpirvShaderTranslator::Modification GetCurrentVertexShaderModification(
       const Shader& shader,
       Shader::HostVertexShaderType host_vertex_shader_type,
-      uint32_t interpolator_mask) const;
+      uint32_t interpolator_mask, bool ps_param_gen_used) const;
   SpirvShaderTranslator::Modification GetCurrentPixelShaderModification(
       const Shader& shader, uint32_t interpolator_mask,
       uint32_t param_gen_pos) const;
diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc
index b7f37f4b9..86b13b4ae 100644
--- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc
@@ -36,7 +36,9 @@ bool VulkanPrimitiveProcessor::Initialize() {
   if (!InitializeCommon(device_features.fullDrawIndexUint32,
                         !device_portability_subset_features ||
                             device_portability_subset_features->triangleFans,
-                        false, device_features.geometryShader)) {
+                        false, device_features.geometryShader,
+                        device_features.geometryShader,
+                        device_features.geometryShader)) {
     Shutdown();
     return false;
   }
@@ -127,9 +129,9 @@ void VulkanPrimitiveProcessor::EndFrame() {
   frame_index_buffers_.clear();
 }
 
-bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
-    uint32_t index_count, std::function<void(uint16_t*)> fill_callback) {
-  assert_not_zero(index_count);
+bool VulkanPrimitiveProcessor::InitializeBuiltinIndexBuffer(
+    size_t size_bytes, std::function<void(void*)> fill_callback) {
+  assert_not_zero(size_bytes);
   assert_true(builtin_index_buffer_ == VK_NULL_HANDLE);
   assert_true(builtin_index_buffer_memory_ == VK_NULL_HANDLE);
   assert_true(builtin_index_buffer_upload_ == VK_NULL_HANDLE);
@@ -140,7 +142,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
   const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
   VkDevice device = provider.device();
 
-  builtin_index_buffer_size_ = VkDeviceSize(sizeof(uint16_t) * index_count);
+  builtin_index_buffer_size_ = VkDeviceSize(size_bytes);
   if (!ui::vulkan::util::CreateDedicatedAllocationBuffer(
           provider, builtin_index_buffer_size_,
           VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
@@ -148,8 +150,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
           builtin_index_buffer_memory_)) {
     XELOGE(
         "Vulkan primitive processor: Failed to create the built-in index "
-        "buffer GPU resource with {} 16-bit indices",
-        index_count);
+        "buffer GPU resource with {} bytes",
+        size_bytes);
     return false;
   }
   uint32_t upload_memory_type;
@@ -161,8 +163,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
           &upload_memory_type)) {
     XELOGE(
         "Vulkan primitive processor: Failed to create the built-in index "
-        "buffer upload resource with {} 16-bit indices",
-        index_count);
+        "buffer upload resource with {} bytes",
+        size_bytes);
     ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device,
                                            builtin_index_buffer_);
     ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device,
@@ -175,8 +177,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
                       VK_WHOLE_SIZE, 0, &mapping) != VK_SUCCESS) {
     XELOGE(
         "Vulkan primitive processor: Failed to map the built-in index buffer "
-        "upload resource with {} 16-bit indices",
-        index_count);
+        "upload resource with {} bytes",
+        size_bytes);
     ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device,
                                            builtin_index_buffer_upload_);
     ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device,
@@ -187,7 +189,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
                                            builtin_index_buffer_memory_);
     return false;
   }
-  fill_callback(reinterpret_cast<uint16_t*>(mapping));
+  fill_callback(mapping);
   ui::vulkan::util::FlushMappedMemoryRange(
       provider, builtin_index_buffer_memory_, upload_memory_type);
   dfn.vkUnmapMemory(device, builtin_index_buffer_upload_memory_);
diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.h b/src/xenia/gpu/vulkan/vulkan_primitive_processor.h
index 50e729577..ea8ed4fed 100644
--- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.h
@@ -56,9 +56,8 @@ class VulkanPrimitiveProcessor final : public PrimitiveProcessor {
   }
 
  protected:
-  bool InitializeBuiltin16BitIndexBuffer(
-      uint32_t index_count,
-      std::function<void(uint16_t*)> fill_callback) override;
+  bool InitializeBuiltinIndexBuffer(
+      size_t size_bytes, std::function<void(void*)> fill_callback) override;
 
   void* RequestHostConvertedIndexBufferForCurrentFrame(
       xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,