From a94301d967b5a0fb9753df8b70a1fbdd21b13e71 Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Wed, 18 Nov 2020 12:48:12 +0300
Subject: [PATCH] [Vulkan] Viewport from draw_util and vtx_fmt

---
 src/xenia/gpu/spirv_shader_translator.cc      | 130 ++++++++++++++++++
 src/xenia/gpu/spirv_shader_translator.h       |  24 ++++
 .../gpu/vulkan/vulkan_command_processor.cc    | 127 ++++++++++-------
 .../gpu/vulkan/vulkan_command_processor.h     |   6 +-
 4 files changed, 232 insertions(+), 55 deletions(-)

diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc
index 69d1c04ac..de9c6c969 100644
--- a/src/xenia/gpu/spirv_shader_translator.cc
+++ b/src/xenia/gpu/spirv_shader_translator.cc
@@ -168,10 +168,13 @@ void SpirvShaderTranslator::StartTranslation() {
     spv::Id type;
   };
   const SystemConstant system_constants[] = {
+      {"flags", offsetof(SystemConstants, flags), type_uint_},
       {"vertex_index_endian", offsetof(SystemConstants, vertex_index_endian),
        type_uint_},
       {"vertex_base_index", offsetof(SystemConstants, vertex_base_index),
        type_int_},
+      {"ndc_scale", offsetof(SystemConstants, ndc_scale), type_float3_},
+      {"ndc_offset", offsetof(SystemConstants, ndc_offset), type_float3_},
   };
   id_vector_temp_.clear();
   id_vector_temp_.reserve(xe::countof(system_constants));
@@ -997,6 +1000,133 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
 }
 
 void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(builder_->makeIntConstant(kSystemConstantFlags));
+  spv::Id system_constant_flags = builder_->createLoad(
+      builder_->createAccessChain(spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+      spv::NoPrecision);
+
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(
+      builder_->makeIntConstant(kOutputPerVertexMemberPosition));
+  spv::Id position_ptr = builder_->createAccessChain(
+      spv::StorageClassOutput, output_per_vertex_, id_vector_temp_);
+  spv::Id guest_position = builder_->createLoad(position_ptr, spv::NoPrecision);
+
+  // Check if the shader already returns W, not 1/W, and if it doesn't, turn 1/W
+  // into W.
+  spv::Id position_w =
+      builder_->createCompositeExtract(guest_position, type_float_, 3);
+  spv::Id is_w_not_reciprocal = builder_->createBinOp(
+      spv::OpINotEqual, type_bool_,
+      builder_->createBinOp(
+          spv::OpBitwiseAnd, type_uint_, system_constant_flags,
+          builder_->makeUintConstant(
+              static_cast<unsigned int>(kSysFlag_WNotReciprocal))),
+      const_uint_0_);
+  spv::Id guest_position_w_inv = builder_->createBinOp(
+      spv::OpFDiv, type_float_, const_float_1_, position_w);
+  builder_->addDecoration(guest_position_w_inv, spv::DecorationNoContraction);
+  position_w =
+      builder_->createTriOp(spv::OpSelect, type_float_, is_w_not_reciprocal,
+                            position_w, guest_position_w_inv);
+
+  // Check if the shader returns XY/W rather than XY, and if it does, revert
+  // that.
+  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
+  // affine interpolation.
+  uint_vector_temp_.clear();
+  uint_vector_temp_.reserve(2);
+  uint_vector_temp_.push_back(0);
+  uint_vector_temp_.push_back(1);
+  spv::Id position_xy = builder_->createRvalueSwizzle(
+      spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_);
+  spv::Id is_xy_divided_by_w = builder_->createBinOp(
+      spv::OpINotEqual, type_bool_,
+      builder_->createBinOp(
+          spv::OpBitwiseAnd, type_uint_, system_constant_flags,
+          builder_->makeUintConstant(
+              static_cast<unsigned int>(kSysFlag_XYDividedByW))),
+      const_uint_0_);
+  spv::Id guest_position_xy_mul_w = builder_->createBinOp(
+      spv::OpVectorTimesScalar, type_float2_, position_xy, position_w);
+  builder_->addDecoration(guest_position_xy_mul_w,
+                          spv::DecorationNoContraction);
+  position_xy =
+      builder_->createTriOp(spv::OpSelect, type_float2_, is_xy_divided_by_w,
+                            guest_position_xy_mul_w, position_xy);
+
+  // Check if the shader returns Z/W rather than Z, and if it does, revert that.
+  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
+  // affine interpolation.
+  spv::Id position_z =
+      builder_->createCompositeExtract(guest_position, type_float_, 2);
+  spv::Id is_z_divided_by_w = builder_->createBinOp(
+      spv::OpINotEqual, type_bool_,
+      builder_->createBinOp(
+          spv::OpBitwiseAnd, type_uint_, system_constant_flags,
+          builder_->makeUintConstant(
+              static_cast<unsigned int>(kSysFlag_ZDividedByW))),
+      const_uint_0_);
+  spv::Id guest_position_z_mul_w =
+      builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w);
+  builder_->addDecoration(guest_position_z_mul_w, spv::DecorationNoContraction);
+  position_z =
+      builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w,
+                            guest_position_z_mul_w, position_z);
+
+  // Build XYZ of the position with W format handled.
+  spv::Id position_xyz;
+  {
+    std::unique_ptr<spv::Instruction> composite_construct_op =
+        std::make_unique<spv::Instruction>(
+            builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct);
+    composite_construct_op->addIdOperand(position_xy);
+    composite_construct_op->addIdOperand(position_z);
+    position_xyz = composite_construct_op->getResultId();
+    builder_->getBuildPoint()->addInstruction(
+        std::move(composite_construct_op));
+  }
+
+  // Apply the NDC scale and offset for guest to host viewport transformation.
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(builder_->makeIntConstant(kSystemConstantNdcScale));
+  spv::Id ndc_scale = builder_->createLoad(
+      builder_->createAccessChain(spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+      spv::NoPrecision);
+  position_xyz =
+      builder_->createBinOp(spv::OpFMul, type_float3_, position_xyz, ndc_scale);
+  builder_->addDecoration(position_xyz, spv::DecorationNoContraction);
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(
+      builder_->makeIntConstant(kSystemConstantNdcOffset));
+  spv::Id ndc_offset = builder_->createLoad(
+      builder_->createAccessChain(spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+      spv::NoPrecision);
+  spv::Id ndc_offset_mul_w = builder_->createBinOp(
+      spv::OpVectorTimesScalar, type_float3_, ndc_offset, position_w);
+  builder_->addDecoration(ndc_offset_mul_w, spv::DecorationNoContraction);
+  position_xyz = builder_->createBinOp(spv::OpFAdd, type_float3_, position_xyz,
+                                       ndc_offset_mul_w);
+  builder_->addDecoration(position_xyz, spv::DecorationNoContraction);
+
+  // Store the position converted to the host.
+  spv::Id position;
+  {
+    std::unique_ptr<spv::Instruction> composite_construct_op =
+        std::make_unique<spv::Instruction>(
+            builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
+    composite_construct_op->addIdOperand(position_xyz);
+    composite_construct_op->addIdOperand(position_w);
+    position = composite_construct_op->getResultId();
+    builder_->getBuildPoint()->addInstruction(
+        std::move(composite_construct_op));
+  }
+  builder_->createStore(position, position_ptr);
+
   // Write 1 to point size (using a geometry shader or another kind of fallback
   // to expand point sprites - point size support is not guaranteed, and the
   // size would also be limited, and can't be controlled independently along two
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h
index d4c32dda2..fadcf2a6b 100644
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@@ -25,12 +25,33 @@ namespace gpu {
 
 class SpirvShaderTranslator : public ShaderTranslator {
  public:
+  enum : uint32_t {
+    kSysFlag_XYDividedByW_Shift,    // Bit 0: shader XY is already XY/W.
+    kSysFlag_ZDividedByW_Shift,     // Bit 1: shader Z is already Z/W.
+    kSysFlag_WNotReciprocal_Shift,  // Bit 2: shader W is W, not 1/W.

+    kSysFlag_Count,  // Number of flag bits; must fit in the 32-bit flags word.

+    kSysFlag_XYDividedByW = 1u << kSysFlag_XYDividedByW_Shift,  // Mask.
+    kSysFlag_ZDividedByW = 1u << kSysFlag_ZDividedByW_Shift,  // Mask.
+    kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift,  // Mask.
+  };
+  static_assert(kSysFlag_Count <= 32, "Too many flags in the system constants");
+
   // IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED:
   // - SystemConstantIndex enum.
   // - Structure members in BeginTranslation.
   struct SystemConstants {
+    uint32_t flags;  // Bitwise OR of the kSysFlag_* masks.
     xenos::Endian vertex_index_endian;
     int32_t vertex_base_index;
+    uint32_t padding_vertex_base_index;  // Likely alignment padding - confirm.

+    float ndc_scale[3];  // Multiplied into the output position XYZ.
+    uint32_t padding_ndc_scale;  // Likely alignment padding - confirm.

+    float ndc_offset[3];  // Multiplied by W, then added to the scaled XYZ.
+    uint32_t padding_ndc_offset;  // Likely alignment padding - confirm.
   };
 
   // The minimum limit for maxPerStageDescriptorStorageBuffers is 4, and for
@@ -329,8 +350,11 @@ class SpirvShaderTranslator : public ShaderTranslator {
   spv::Id const_float2_0_1_;
 
   enum SystemConstantIndex : unsigned int {
+    kSystemConstantFlags,
     kSystemConstantIndexVertexIndexEndian,
     kSystemConstantIndexVertexBaseIndex,
+    kSystemConstantNdcScale,
+    kSystemConstantNdcOffset,
   };
   spv::Id uniform_system_constants_;
   spv::Id uniform_float_constants_;
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 6b5a51006..9b4d598f1 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -686,14 +686,45 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
     current_graphics_pipeline_layout_ = pipeline_layout;
   }
 
+  const RegisterFile& regs = *register_file_;
+  const ui::vulkan::VulkanProvider& provider =
+      GetVulkanContext().GetVulkanProvider();
+  const VkPhysicalDeviceProperties& device_properties =
+      provider.device_properties();
+
+  // Get dynamic rasterizer state.
+  draw_util::ViewportInfo viewport_info;
+  // Just handling maxViewportDimensions is enough - viewportBoundsRange[1] must
+  // be at least 2 * max(maxViewportDimensions[0...1]) - 1, and
+  // maxViewportDimensions must be greater than or equal to the size of the
+  // largest possible framebuffer attachment (if the viewport has positive
+  // offset and is between maxViewportDimensions and viewportBoundsRange[1],
+  // GetHostViewportInfo will adjust ndc_scale/ndc_offset to clamp it, and the
+  // clamped range will be outside the largest possible framebuffer anyway).
+  // TODO(Triang3l): Possibly handle maxViewportDimensions and
+  // viewportBoundsRange separately because when using fragment shader
+  // interlocks, framebuffers are not used, while the range may be wider than
+  // dimensions? Though viewport bigger than 4096 - the smallest possible
+  // maximum dimension (which is below the 8192 texture size limit on the Xbox
+  // 360) - and with offset, is probably a situation that never happens in real
+  // life. Or even disregard the viewport bounds range in the fragment shader
+  // interlocks case completely - apply the viewport and the scissor offset
+  // directly to pixel address and to things like ps_param_gen.
+  draw_util::GetHostViewportInfo(
+      regs, 1.0f, 1.0f, false,
+      float(device_properties.limits.maxViewportDimensions[0]),
+      float(device_properties.limits.maxViewportDimensions[1]), true,
+      viewport_info);
+
   // Update fixed-function dynamic state.
-  UpdateFixedFunctionState();
+  UpdateFixedFunctionState(viewport_info);
 
   bool indexed = index_buffer_info != nullptr && index_buffer_info->guest_base;
 
   // Update system constants before uploading them.
-  UpdateSystemConstantValues(indexed ? index_buffer_info->endianness
-                                     : xenos::Endian::kNone);
+  UpdateSystemConstantValues(
+      indexed ? index_buffer_info->endianness : xenos::Endian::kNone,
+      viewport_info);
 
   // Update uniform buffers and descriptor sets after binding the pipeline with
   // the new layout.
@@ -701,8 +732,6 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
     return false;
   }
 
-  const RegisterFile& regs = *register_file_;
-
   // Ensure vertex buffers are resident.
   // TODO(Triang3l): Cache residency for ranges in a way similar to how texture
   // validity is tracked.
@@ -1229,7 +1258,8 @@ VkShaderStageFlags VulkanCommandProcessor::GetGuestVertexShaderStageFlags()
   return stages;
 }
 
-void VulkanCommandProcessor::UpdateFixedFunctionState() {
+void VulkanCommandProcessor::UpdateFixedFunctionState(
+    const draw_util::ViewportInfo& viewport_info) {
 #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
   SCOPE_profile_cpu_f("gpu");
 #endif  // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
@@ -1245,53 +1275,13 @@ void VulkanCommandProcessor::UpdateFixedFunctionState() {
   uint32_t pixel_size_x = 1, pixel_size_y = 1;
 
   // Viewport.
-  // PA_CL_VTE_CNTL contains whether offsets and scales are enabled.
-  // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
-  // In games, either all are enabled (for regular drawing) or none are (for
-  // rectangle lists usually).
-  //
-  // If scale/offset is enabled, the Xenos shader is writing (neglecting W
-  // division) position in the NDC (-1, -1, dx_clip_space_def - 1) -> (1, 1, 1)
-  // box. If it's not, the position is in screen space. Since we can only use
-  // the NDC in PC APIs, we use a viewport of the largest possible size, and
-  // divide the position by it in translated shaders.
-  //
-  // TODO(Triang3l): Move all of this to draw_util.
-  // TODO(Triang3l): Limit the viewport if exceeding the device limit; move to
-  // NDC scale/offset constants.
-  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
-  float viewport_scale_x =
-      pa_cl_vte_cntl.vport_x_scale_ena
-          ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32)
-          : 4096.0f;
-  float viewport_scale_y =
-      pa_cl_vte_cntl.vport_y_scale_ena
-          ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
-          : 4096.0f;
-  float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena
-                               ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32
-                               : 1.0f;
-  float viewport_offset_x = pa_cl_vte_cntl.vport_x_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
-                                : std::abs(viewport_scale_x);
-  float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
-                                : std::abs(viewport_scale_y);
-  float viewport_offset_z = pa_cl_vte_cntl.vport_z_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32
-                                : 0.0f;
-  if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
-    viewport_offset_x += float(pa_sc_window_offset.window_x_offset);
-    viewport_offset_y += float(pa_sc_window_offset.window_y_offset);
-  }
   VkViewport viewport;
-  viewport.x = (viewport_offset_x - viewport_scale_x) * float(pixel_size_x);
-  viewport.y = (viewport_offset_y - viewport_scale_y) * float(pixel_size_y);
-  viewport.width = viewport_scale_x * 2.0f * float(pixel_size_x);
-  viewport.height = viewport_scale_y * 2.0f * float(pixel_size_y);
-  viewport.minDepth = std::min(std::max(viewport_offset_z, 0.0f), 1.0f);
-  viewport.maxDepth =
-      std::min(std::max(viewport_offset_z + viewport_scale_z, 0.0f), 1.0f);
+  viewport.x = viewport_info.left;
+  viewport.y = viewport_info.top;
+  viewport.width = viewport_info.width;
+  viewport.height = viewport_info.height;
+  viewport.minDepth = viewport_info.z_min;
+  viewport.maxDepth = viewport_info.z_max;
   ff_viewport_update_needed_ |= ff_viewport_.x != viewport.x;
   ff_viewport_update_needed_ |= ff_viewport_.y != viewport.y;
   ff_viewport_update_needed_ |= ff_viewport_.width != viewport.width;
@@ -1326,16 +1316,39 @@ void VulkanCommandProcessor::UpdateFixedFunctionState() {
 }
 
 void VulkanCommandProcessor::UpdateSystemConstantValues(
-    xenos::Endian index_endian) {
+    xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info) {
 #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
   SCOPE_profile_cpu_f("gpu");
 #endif  // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
 
   const RegisterFile& regs = *register_file_;
+  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
   int32_t vgt_indx_offset = int32_t(regs[XE_GPU_REG_VGT_INDX_OFFSET].u32);
 
   bool dirty = false;
 
+  // Flags.
+  uint32_t flags = 0;
+  // W0 division control.
+  // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
+  // 8: VTX_XY_FMT = true: the incoming XY have already been multiplied by 1/W0.
+  //               = false: multiply the X, Y coordinates by 1/W0.
+  // 9: VTX_Z_FMT = true: the incoming Z has already been multiplied by 1/W0.
+  //              = false: multiply the Z coordinate by 1/W0.
+  // 10: VTX_W0_FMT = true: the incoming W0 is not 1/W0. Perform the reciprocal
+  //                        to get 1/W0.
+  if (pa_cl_vte_cntl.vtx_xy_fmt) {
+    flags |= SpirvShaderTranslator::kSysFlag_XYDividedByW;
+  }
+  if (pa_cl_vte_cntl.vtx_z_fmt) {
+    flags |= SpirvShaderTranslator::kSysFlag_ZDividedByW;
+  }
+  if (pa_cl_vte_cntl.vtx_w0_fmt) {
+    flags |= SpirvShaderTranslator::kSysFlag_WNotReciprocal;
+  }
+  dirty |= system_constants_.flags != flags;
+  system_constants_.flags = flags;
+
   // Index or tessellation edge factor buffer endianness.
   dirty |= system_constants_.vertex_index_endian != index_endian;
   system_constants_.vertex_index_endian = index_endian;
@@ -1344,6 +1357,14 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
   dirty |= system_constants_.vertex_base_index != vgt_indx_offset;
   system_constants_.vertex_base_index = vgt_indx_offset;
 
+  // Conversion to host normalized device coordinates.
+  for (uint32_t i = 0; i < 3; ++i) {
+    dirty |= system_constants_.ndc_scale[i] != viewport_info.ndc_scale[i];
+    dirty |= system_constants_.ndc_offset[i] != viewport_info.ndc_offset[i];
+    system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i];
+    system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i];
+  }
+
   if (dirty) {
     current_graphics_descriptor_set_values_up_to_date_ &=
         ~(uint32_t(1) << SpirvShaderTranslator::kDescriptorSetSystemConstants);
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h
index a7283d56f..e083b3755 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "xenia/gpu/command_processor.h"
+#include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/spirv_shader_translator.h"
 #include "xenia/gpu/vulkan/deferred_command_buffer.h"
 #include "xenia/gpu/vulkan/vulkan_graphics_system.h"
@@ -170,8 +171,9 @@ class VulkanCommandProcessor : public CommandProcessor {
 
   VkShaderStageFlags GetGuestVertexShaderStageFlags() const;
 
-  void UpdateFixedFunctionState();
-  void UpdateSystemConstantValues(xenos::Endian index_endian);
+  void UpdateFixedFunctionState(const draw_util::ViewportInfo& viewport_info);
+  void UpdateSystemConstantValues(xenos::Endian index_endian,
+                                  const draw_util::ViewportInfo& viewport_info);
   bool UpdateBindings(const VulkanShader* vertex_shader,
                       const VulkanShader* pixel_shader);
   // Allocates a descriptor, space in the uniform buffer pool, and fills the