[Vulkan] Use UDiv/UMod by constant tile size + minor transfer cleanup

Drivers compile division and modulo by a constant to a multiplication and a shift anyway.
2022-06-20 22:24:07 +03:00 · 2022-06-20 22:24:07 +03:00 · c6ec6d8239
parent 61c4c49d76
commit c6ec6d8239
1 changed file with 41 additions and 86 deletions
--- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc
+++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc
@@ -1606,7 +1606,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
    id_vector_temp.push_back(builder.makeRuntimeArray(type_uint));
    // Storage buffers have std430 packing, no padding to 4-component vectors.
    builder.addDecoration(id_vector_temp.back(), spv::DecorationArrayStride,
-                          sizeof(float));
+                          sizeof(uint32_t));
    spv::Id type_host_depth_source_buffer =
        builder.makeStructType(id_vector_temp, "XeTransferHostDepthBuffer");
    builder.addMemberName(type_host_depth_source_buffer, 0, "host_depth");
@@ -1754,12 +1754,19 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
  // Working with unsigned numbers for simplicity now, bitcasting to signed will
  // be done at texture fetch.

-  uint32_t tile_width_samples_scaled =
+  uint32_t tile_width_samples =
      xenos::kEdramTileWidthSamples * draw_resolution_scale_x();
-  uint32_t tile_height_samples_scaled =
+  uint32_t tile_height_samples =
      xenos::kEdramTileHeightSamples * draw_resolution_scale_y();

-  // Convert the fragment coordinates to uint2.
+  // Split the destination pixel index into 32bpp tile and 32bpp-tile-relative
+  // pixel index.
+  // Note that division by non-power-of-two constants will include a 4-cycle
+  // 32*32 multiplication on AMD, even though so many bits are not needed for
+  // the pixel position - however, if an OpUnreachable path is inserted for the
+  // case when the position has upper bits set, for some reason, the code for it
+  // is not eliminated when compiling the shader for AMD via RenderDoc on
+  // Windows, as of June 2022.
  uint_vector_temp.clear();
  uint_vector_temp.reserve(2);
  uint_vector_temp.push_back(0);
@@ -1770,77 +1777,25 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
          spv::NoPrecision, type_float2,
          builder.createLoad(input_fragment_coord, spv::NoPrecision),
          uint_vector_temp));
-
-  // Prove to the AMD compiler that 24*24 multiplication can be done. 16 bits
-  // are more than enough for coordinates even with 3x resolution scaling (and
-  // Direct3D 11 hardware has 16.8 fixed-point coordinates).
-  // TODO(Triang3l): OpUnreachable if the coordinates have upper bits set.
-
-  // Split the destination pixel coordinate into scalars.
  spv::Id dest_pixel_x =
      builder.createCompositeExtract(dest_pixel_coord, type_uint, 0);
+  spv::Id const_dest_tile_width_pixels = builder.makeUintConstant(
+      tile_width_samples >>
+      (uint32_t(dest_is_64bpp) +
+       uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X)));
+  spv::Id dest_tile_index_x = builder.createBinOp(
+      spv::OpUDiv, type_uint, dest_pixel_x, const_dest_tile_width_pixels);
+  spv::Id dest_tile_pixel_x = builder.createBinOp(
+      spv::OpUMod, type_uint, dest_pixel_x, const_dest_tile_width_pixels);
  spv::Id dest_pixel_y =
      builder.createCompositeExtract(dest_pixel_coord, type_uint, 1);
-
-  // Split the destination pixel index into 32bpp tile and 32bpp-tile-relative
-  // pixel index.
-  uint32_t dest_sample_width_log2 =
-      uint32_t(dest_is_64bpp) +
-      uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X);
-  uint32_t dest_sample_height_log2 =
-      uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X);
-  uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_shift;
-  draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
-      draw_resolution_scale_x(), dest_tile_width_divide_scale,
-      dest_tile_width_divide_shift);
-  // Doing 16*16=32 multiplication, not 32*32=64.
-  // TODO(Triang3l): Abstract this away, don't do 32*32 on Direct3D 12 too.
-  dest_tile_width_divide_scale &= UINT16_MAX;
-  dest_tile_width_divide_shift += 16;
-  // Need the host tile size in pixels, not samples.
-  dest_tile_width_divide_shift -= dest_sample_width_log2;
-  spv::Id dest_tile_index_x = builder.createBinOp(
-      spv::OpShiftRightLogical, type_uint,
-      builder.createBinOp(
-          spv::OpIMul, type_uint, dest_pixel_x,
-          builder.makeUintConstant(dest_tile_width_divide_scale)),
-      builder.makeUintConstant(dest_tile_width_divide_shift));
-  spv::Id dest_tile_pixel_x = builder.createBinOp(
-      spv::OpISub, type_uint, dest_pixel_x,
-      builder.createBinOp(spv::OpIMul, type_uint, dest_tile_index_x,
-                          builder.makeUintConstant(tile_width_samples_scaled >>
-                                                   dest_sample_width_log2)));
-  spv::Id dest_tile_index_y, dest_tile_pixel_y;
-  static_assert(
-      TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
-      "VulkanRenderTargetCache EDRAM range ownership transfer shader "
-      "generation supports Y draw resolution scaling factors of only up to 3");
-  if (draw_resolution_scale_y() == 3) {
-    dest_tile_index_y = builder.createBinOp(
-        spv::OpShiftRightLogical, type_uint,
-        builder.createBinOp(
-            spv::OpIMul, type_uint, dest_pixel_y,
-            builder.makeUintConstant(draw_util::kDivideScale3 & UINT16_MAX)),
-        builder.makeUintConstant(draw_util::kDivideUpperShift3 + 16 + 4 -
-                                 dest_sample_height_log2));
-    dest_tile_pixel_y = builder.createBinOp(
-        spv::OpISub, type_uint, dest_pixel_y,
-        builder.createBinOp(
-            spv::OpIMul, type_uint, dest_tile_index_y,
-            builder.makeUintConstant(tile_height_samples_scaled >>
-                                     dest_sample_height_log2)));
-  } else {
-    assert_true(draw_resolution_scale_y() <= 2);
-    uint32_t dest_tile_height_pixels_log2 =
-        (draw_resolution_scale_y() == 2 ? 5 : 4) - dest_sample_height_log2;
-    dest_tile_index_y = builder.createBinOp(
-        spv::OpShiftRightLogical, type_uint, dest_pixel_y,
-        builder.makeUintConstant(dest_tile_height_pixels_log2));
-    dest_tile_pixel_y = builder.createBinOp(
-        spv::OpBitwiseAnd, type_uint, dest_pixel_y,
-        builder.makeUintConstant((uint32_t(1) << dest_tile_height_pixels_log2) -
-                                 1));
-  }
+  spv::Id const_dest_tile_height_pixels = builder.makeUintConstant(
+      tile_height_samples >>
+      uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X));
+  spv::Id dest_tile_index_y = builder.createBinOp(
+      spv::OpUDiv, type_uint, dest_pixel_y, const_dest_tile_height_pixels);
+  spv::Id dest_tile_pixel_y = builder.createBinOp(
+      spv::OpUMod, type_uint, dest_pixel_y, const_dest_tile_height_pixels);

  assert_true(push_constants_member_address != UINT32_MAX);
  id_vector_temp.clear();
@@ -2269,7 +2224,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
    // Copying between color and depth / stencil - swap 40-32bpp-sample columns
    // in the pixel index within the source 32bpp tile.
    uint32_t source_32bpp_tile_half_pixels =
-        tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2);
+        tile_width_samples >> (1 + source_pixel_width_dwords_log2);
    source_tile_pixel_x = builder.createUnaryOp(
        spv::OpBitcast, type_uint,
        builder.createBinOp(
@@ -2315,7 +2270,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
          spv::OpIAdd, type_uint,
          builder.createBinOp(
              spv::OpIMul, type_uint,
-              builder.makeUintConstant(tile_width_samples_scaled >>
+              builder.makeUintConstant(tile_width_samples >>
                                       source_pixel_width_dwords_log2),
              source_tile_index_x),
          source_tile_pixel_x));
@@ -2326,7 +2281,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
          builder.createBinOp(
              spv::OpIMul, type_uint,
              builder.makeUintConstant(
-                  tile_height_samples_scaled >>
+                  tile_height_samples >>
                  uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)),
              source_tile_index_y),
          source_tile_pixel_y));
@@ -2688,8 +2643,8 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
        switch (source_depth_format) {
          case xenos::DepthRenderTargetFormat::kD24S8: {
            // Round to the nearest even integer. This seems to be the
-            // correct, adding +0.5 and rounding towards zero results in red
-            // instead of black in the 4D5307E6 clear shader.
+            // correct conversion; adding +0.5 and rounding towards zero results
+            // in red instead of black in the 4D5307E6 clear shader.
            id_vector_temp.clear();
            id_vector_temp.push_back(builder.createBinOp(
                spv::OpFMul, type_float, source_depth_float[i],
@@ -3003,9 +2958,9 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
      } else {
        switch (source_depth_format) {
          case xenos::DepthRenderTargetFormat::kD24S8: {
-            // Round to the nearest even integer. This seems to be the correct,
-            // adding +0.5 and rounding towards zero results in red instead of
-            // black in the 4D5307E6 clear shader.
+            // Round to the nearest even integer. This seems to be the correct
+            // conversion; adding +0.5 and rounding towards zero results in red
+            // instead of black in the 4D5307E6 clear shader.
            id_vector_temp.clear();
            id_vector_temp.push_back(builder.createBinOp(
                spv::OpFMul, type_float, source_depth_float[0],
@@ -3384,7 +3339,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
                    spv::OpIAdd, type_uint,
                    builder.createBinOp(spv::OpIMul, type_uint,
                                        builder.makeUintConstant(
-                                            tile_width_samples_scaled >>
+                                            tile_width_samples >>
                                            uint32_t(key.source_msaa_samples >=
                                                     xenos::MsaaSamples::k4X)),
                                        host_depth_source_tile_index_x),
@@ -3395,7 +3350,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
                    spv::OpIAdd, type_uint,
                    builder.createBinOp(spv::OpIMul, type_uint,
                                        builder.makeUintConstant(
-                                            tile_height_samples_scaled >>
+                                            tile_height_samples >>
                                            uint32_t(key.source_msaa_samples >=
                                                     xenos::MsaaSamples::k2X)),
                                        host_depth_source_tile_index_y),
@@ -3469,14 +3424,14 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
                spv::OpIAdd, type_uint,
                builder.createBinOp(
                    spv::OpIMul, type_uint,
-                    builder.makeUintConstant(tile_width_samples_scaled *
-                                             tile_height_samples_scaled),
+                    builder.makeUintConstant(tile_width_samples *
+                                             tile_height_samples),
                    dest_tile_index),
                builder.createBinOp(
                    spv::OpIAdd, type_uint,
                    builder.createBinOp(
                        spv::OpIMul, type_uint,
-                        builder.makeUintConstant(tile_width_samples_scaled),
+                        builder.makeUintConstant(tile_width_samples),
                        dest_tile_sample_y),
                    dest_tile_sample_x));
            id_vector_temp.clear();
@@ -3505,8 +3460,8 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader(
            switch (dest_depth_format) {
              case xenos::DepthRenderTargetFormat::kD24S8: {
                // Round to the nearest even integer. This seems to be the
-                // correct, adding +0.5 and rounding towards zero results in red
-                // instead of black in the 4D5307E6 clear shader.
+                // correct conversion; adding +0.5 and rounding towards zero
+                // results in red instead of black in the 4D5307E6 clear shader.
                id_vector_temp.clear();
                id_vector_temp.push_back(builder.createBinOp(
                    spv::OpFMul, type_float, host_depth32,