From c6ec6d8239f2364c5103b8acdc005050279a6a1b Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 20 Jun 2022 22:24:07 +0300 Subject: [PATCH] [Vulkan] Use UDiv/UMod by constant tile size + minor transfer cleanup Drivers compile that to a multiplication and a shift anyway. --- .../gpu/vulkan/vulkan_render_target_cache.cc | 127 ++++++------------ 1 file changed, 41 insertions(+), 86 deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index d979c5748..bf2ed52b7 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -1606,7 +1606,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( id_vector_temp.push_back(builder.makeRuntimeArray(type_uint)); // Storage buffers have std430 packing, no padding to 4-component vectors. builder.addDecoration(id_vector_temp.back(), spv::DecorationArrayStride, - sizeof(float)); + sizeof(uint32_t)); spv::Id type_host_depth_source_buffer = builder.makeStructType(id_vector_temp, "XeTransferHostDepthBuffer"); builder.addMemberName(type_host_depth_source_buffer, 0, "host_depth"); @@ -1754,12 +1754,19 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( // Working with unsigned numbers for simplicity now, bitcasting to signed will // be done at texture fetch. - uint32_t tile_width_samples_scaled = + uint32_t tile_width_samples = xenos::kEdramTileWidthSamples * draw_resolution_scale_x(); - uint32_t tile_height_samples_scaled = + uint32_t tile_height_samples = xenos::kEdramTileHeightSamples * draw_resolution_scale_y(); - // Convert the fragment coordinates to uint2. + // Split the destination pixel index into 32bpp tile and 32bpp-tile-relative + // pixel index. + // Note that division by non-power-of-two constants will include a 4-cycle + // 32*32 multiplication on AMD, even though so many bits are not needed for + // the pixel position - however, if an OpUnreachable path is inserted for the + // case when the position has upper bits set, for some reason, the code for it + // is not eliminated when compiling the shader for AMD via RenderDoc on + // Windows, as of June 2022. uint_vector_temp.clear(); uint_vector_temp.reserve(2); uint_vector_temp.push_back(0); @@ -1770,77 +1777,25 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( spv::NoPrecision, type_float2, builder.createLoad(input_fragment_coord, spv::NoPrecision), uint_vector_temp)); - - // Prove to the AMD compiler that 24*24 multiplication can be done. 16 bits - // are more than enough for coordinates even with 3x resolution scaling (and - // Direct3D 11 hardware has 16.8 fixed-point coordinates). - // TODO(Triang3l): OpUnreachable if the coordinates have upper bits set. - - // Split the destination pixel coordinate into scalars. spv::Id dest_pixel_x = builder.createCompositeExtract(dest_pixel_coord, type_uint, 0); + spv::Id const_dest_tile_width_pixels = builder.makeUintConstant( + tile_width_samples >> + (uint32_t(dest_is_64bpp) + + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X))); + spv::Id dest_tile_index_x = builder.createBinOp( + spv::OpUDiv, type_uint, dest_pixel_x, const_dest_tile_width_pixels); + spv::Id dest_tile_pixel_x = builder.createBinOp( + spv::OpUMod, type_uint, dest_pixel_x, const_dest_tile_width_pixels); spv::Id dest_pixel_y = builder.createCompositeExtract(dest_pixel_coord, type_uint, 1); - - // Split the destination pixel index into 32bpp tile and 32bpp-tile-relative - // pixel index. - uint32_t dest_sample_width_log2 = - uint32_t(dest_is_64bpp) + - uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X); - uint32_t dest_sample_height_log2 = - uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); - uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_shift; - draw_util::GetEdramTileWidthDivideScaleAndUpperShift( - draw_resolution_scale_x(), dest_tile_width_divide_scale, - dest_tile_width_divide_shift); - // Doing 16*16=32 multiplication, not 32*32=64. - // TODO(Triang3l): Abstract this away, don't do 32*32 on Direct3D 12 too. - dest_tile_width_divide_scale &= UINT16_MAX; - dest_tile_width_divide_shift += 16; - // Need the host tile size in pixels, not samples. - dest_tile_width_divide_shift -= dest_sample_width_log2; - spv::Id dest_tile_index_x = builder.createBinOp( - spv::OpShiftRightLogical, type_uint, - builder.createBinOp( - spv::OpIMul, type_uint, dest_pixel_x, - builder.makeUintConstant(dest_tile_width_divide_scale)), - builder.makeUintConstant(dest_tile_width_divide_shift)); - spv::Id dest_tile_pixel_x = builder.createBinOp( - spv::OpISub, type_uint, dest_pixel_x, - builder.createBinOp(spv::OpIMul, type_uint, dest_tile_index_x, - builder.makeUintConstant(tile_width_samples_scaled >> - dest_sample_width_log2))); - spv::Id dest_tile_index_y, dest_tile_pixel_y; - static_assert( - TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, - "VulkanRenderTargetCache EDRAM range ownership transfer shader " - "generation supports Y draw resolution scaling factors of only up to 3"); - if (draw_resolution_scale_y() == 3) { - dest_tile_index_y = builder.createBinOp( - spv::OpShiftRightLogical, type_uint, - builder.createBinOp( - spv::OpIMul, type_uint, dest_pixel_y, - builder.makeUintConstant(draw_util::kDivideScale3 & UINT16_MAX)), - builder.makeUintConstant(draw_util::kDivideUpperShift3 + 16 + 4 - - dest_sample_height_log2)); - dest_tile_pixel_y = builder.createBinOp( - spv::OpISub, type_uint, dest_pixel_y, - builder.createBinOp( - spv::OpIMul, type_uint, dest_tile_index_y, - builder.makeUintConstant(tile_height_samples_scaled >> - dest_sample_height_log2))); - } else { - assert_true(draw_resolution_scale_y() <= 2); - uint32_t dest_tile_height_pixels_log2 = - (draw_resolution_scale_y() == 2 ? 5 : 4) - dest_sample_height_log2; - dest_tile_index_y = builder.createBinOp( - spv::OpShiftRightLogical, type_uint, dest_pixel_y, - builder.makeUintConstant(dest_tile_height_pixels_log2)); - dest_tile_pixel_y = builder.createBinOp( - spv::OpBitwiseAnd, type_uint, dest_pixel_y, - builder.makeUintConstant((uint32_t(1) << dest_tile_height_pixels_log2) - - 1)); - } + spv::Id const_dest_tile_height_pixels = builder.makeUintConstant( + tile_height_samples >> + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X)); + spv::Id dest_tile_index_y = builder.createBinOp( + spv::OpUDiv, type_uint, dest_pixel_y, const_dest_tile_height_pixels); + spv::Id dest_tile_pixel_y = builder.createBinOp( + spv::OpUMod, type_uint, dest_pixel_y, const_dest_tile_height_pixels); assert_true(push_constants_member_address != UINT32_MAX); id_vector_temp.clear(); @@ -2269,7 +2224,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( // Copying between color and depth / stencil - swap 40-32bpp-sample columns // in the pixel index within the source 32bpp tile. uint32_t source_32bpp_tile_half_pixels = - tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2); + tile_width_samples >> (1 + source_pixel_width_dwords_log2); source_tile_pixel_x = builder.createUnaryOp( spv::OpBitcast, type_uint, builder.createBinOp( @@ -2315,7 +2270,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( spv::OpIAdd, type_uint, builder.createBinOp( spv::OpIMul, type_uint, - builder.makeUintConstant(tile_width_samples_scaled >> + builder.makeUintConstant(tile_width_samples >> source_pixel_width_dwords_log2), source_tile_index_x), source_tile_pixel_x)); @@ -2326,7 +2281,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( builder.createBinOp( spv::OpIMul, type_uint, builder.makeUintConstant( - tile_height_samples_scaled >> + tile_height_samples >> uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)), source_tile_index_y), source_tile_pixel_y)); @@ -2688,8 +2643,8 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( switch (source_depth_format) { case xenos::DepthRenderTargetFormat::kD24S8: { // Round to the nearest even integer. This seems to be the - // correct, adding +0.5 and rounding towards zero results in red - // instead of black in the 4D5307E6 clear shader. + // correct conversion, adding +0.5 and rounding towards zero results + // in red instead of black in the 4D5307E6 clear shader. id_vector_temp.clear(); id_vector_temp.push_back(builder.createBinOp( spv::OpFMul, type_float, source_depth_float[i], @@ -3003,9 +2958,9 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( } else { switch (source_depth_format) { case xenos::DepthRenderTargetFormat::kD24S8: { - // Round to the nearest even integer. This seems to be the correct, - // adding +0.5 and rounding towards zero results in red instead of - // black in the 4D5307E6 clear shader. + // Round to the nearest even integer. This seems to be the correct + // conversion, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. id_vector_temp.clear(); id_vector_temp.push_back(builder.createBinOp( spv::OpFMul, type_float, source_depth_float[0], @@ -3384,7 +3339,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( spv::OpIAdd, type_uint, builder.createBinOp(spv::OpIMul, type_uint, builder.makeUintConstant( - tile_width_samples_scaled >> + tile_width_samples >> uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k4X)), host_depth_source_tile_index_x), @@ -3395,7 +3350,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( spv::OpIAdd, type_uint, builder.createBinOp(spv::OpIMul, type_uint, builder.makeUintConstant( - tile_height_samples_scaled >> + tile_height_samples >> uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)), host_depth_source_tile_index_y), @@ -3469,14 +3424,14 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( spv::OpIAdd, type_uint, builder.createBinOp( spv::OpIMul, type_uint, - builder.makeUintConstant(tile_width_samples_scaled * - tile_height_samples_scaled), + builder.makeUintConstant(tile_width_samples * + tile_height_samples), dest_tile_index), builder.createBinOp( spv::OpIAdd, type_uint, builder.createBinOp( spv::OpIMul, type_uint, - builder.makeUintConstant(tile_width_samples_scaled), + builder.makeUintConstant(tile_width_samples), dest_tile_sample_y), dest_tile_sample_x)); id_vector_temp.clear(); @@ -3505,8 +3460,8 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( switch (dest_depth_format) { case xenos::DepthRenderTargetFormat::kD24S8: { // Round to the nearest even integer. This seems to be the - // correct, adding +0.5 and rounding towards zero results in red - // instead of black in the 4D5307E6 clear shader. + // correct conversion, adding +0.5 and rounding towards zero + // results in red instead of black in the 4D5307E6 clear shader. id_vector_temp.clear(); id_vector_temp.push_back(builder.createBinOp( spv::OpFMul, type_float, host_depth32,