diff --git a/src/xenia/gpu/primitive_processor.cc b/src/xenia/gpu/primitive_processor.cc index a41b8a432..68da6d100 100644 --- a/src/xenia/gpu/primitive_processor.cc +++ b/src/xenia/gpu/primitive_processor.cc @@ -691,6 +691,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { // Writing to the trace irrespective of the cache lookup result // because cache behavior depends on runtime configuration and // state. + // Example of 16-bit reset index replacement: 415607D4. trace_writer_.WriteMemoryRead(guest_index_base, guest_index_buffer_needed_bytes); // Not specifying the primitive type in the cache key because not diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 2d574bc47..f7dc2c1f3 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -31,6 +31,7 @@ SpirvShaderTranslator::Features::Features(bool all) max_storage_buffer_range(all ? UINT32_MAX : (128 * 1024 * 1024)), clip_distance(all), cull_distance(all), + full_draw_index_uint32(all), image_view_format_swizzle(all), signed_zero_inf_nan_preserve_float32(all), denorm_flush_to_zero_float32(all) {} @@ -40,7 +41,8 @@ SpirvShaderTranslator::Features::Features( : max_storage_buffer_range( provider.device_properties().limits.maxStorageBufferRange), clip_distance(provider.device_features().shaderClipDistance), - cull_distance(provider.device_features().shaderCullDistance) { + cull_distance(provider.device_features().shaderCullDistance), + full_draw_index_uint32(provider.device_features().fullDrawIndexUint32) { uint32_t device_version = provider.device_properties().apiVersion; const ui::vulkan::VulkanProvider::DeviceExtensions& device_extensions = provider.device_extensions(); @@ -221,6 +223,8 @@ void SpirvShaderTranslator::StartTranslation() { sizeof(uint32_t) * 4); const SystemConstant system_constants[] = { {"flags", offsetof(SystemConstants, flags), type_uint_}, + {"vertex_index_load_address", + offsetof(SystemConstants, vertex_index_load_address), type_uint_}, {"vertex_index_endian", offsetof(SystemConstants, vertex_index_endian), type_uint_}, {"vertex_base_index", offsetof(SystemConstants, vertex_base_index), @@ -1129,18 +1133,73 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { if (register_count()) { // TODO(Triang3l): Barycentric coordinates and patch index. if (IsSpirvVertexShader()) { - // TODO(Triang3l): Fetch the vertex index from the shared memory when - // fullDrawIndexUint32 isn't available and the index is 32-bit and needs - // endian swap. // TODO(Triang3l): Close line loop primitive. - // Load the unswapped index as uint for swapping. + // Load the unswapped index as uint for swapping, or for indirect loading + // if needed. spv::Id vertex_index = builder_->createUnaryOp( spv::OpBitcast, type_uint_, builder_->createLoad(input_vertex_index_, spv::NoPrecision)); + if (!features_.full_draw_index_uint32) { + // Check if the full 32-bit index needs to be loaded indirectly. + spv::Id load_vertex_index = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant( + static_cast(kSysFlag_VertexIndexLoad))), + const_uint_0_); + spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint(); + spv::Block& block_load_vertex_index_start = builder_->makeNewBlock(); + spv::Block& block_load_vertex_index_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_load_vertex_index_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(load_vertex_index, + &block_load_vertex_index_start, + &block_load_vertex_index_merge); + builder_->setBuildPoint(&block_load_vertex_index_start); + // Load the 32-bit index. + // TODO(Triang3l): Bounds checking. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); + spv::Id loaded_vertex_index = + LoadUint32FromSharedMemory(builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + builder_->makeUintConstant(2)), + vertex_index))); + // Get the actual build point for phi. + spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint(); + builder_->createBranch(&block_load_vertex_index_merge); + // Select between the loaded index and the original index from Vulkan. + builder_->setBuildPoint(&block_load_vertex_index_merge); + { + std::unique_ptr loaded_vertex_index_phi_op = + std::make_unique(builder_->getUniqueId(), + type_uint_, spv::OpPhi); + loaded_vertex_index_phi_op->addIdOperand(loaded_vertex_index); + loaded_vertex_index_phi_op->addIdOperand( + block_load_vertex_index_end.getId()); + loaded_vertex_index_phi_op->addIdOperand(vertex_index); + loaded_vertex_index_phi_op->addIdOperand( + block_load_vertex_index_pre.getId()); + vertex_index = loaded_vertex_index_phi_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(loaded_vertex_index_phi_op)); + } + } // Endian-swap the index and convert to int. id_vector_temp_.clear(); id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantIndexVertexIndexEndian)); + builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); spv::Id vertex_index_endian = builder_->createLoad(builder_->createAccessChain( spv::StorageClassUniform, @@ -1152,7 +1211,7 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { // Add the base to the index. id_vector_temp_.clear(); id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantIndexVertexBaseIndex)); + builder_->makeIntConstant(kSystemConstantVertexBaseIndex)); vertex_index = builder_->createBinOp( spv::OpIAdd, type_int_, vertex_index, builder_->createLoad(builder_->createAccessChain( diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 1f26d0887..69d05d95c 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -83,6 +83,9 @@ class SpirvShaderTranslator : public ShaderTranslator { }; enum : uint32_t { + kSysFlag_VertexIndexLoad_Shift, + kSysFlag_ComputeOrPrimitiveVertexIndexLoad_Shift, + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit_Shift, kSysFlag_XYDividedByW_Shift, kSysFlag_ZDividedByW_Shift, kSysFlag_WNotReciprocal_Shift, @@ -98,6 +101,22 @@ class SpirvShaderTranslator : public ShaderTranslator { kSysFlag_Count, + // For HostVertexShaderType kVertex, if fullDrawIndexUint32 is not + // supported (ignored otherwise), whether to fetch the index manually + // (32-bit only - 16-bit indices are always fetched via the Vulkan index + // buffer). + kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift, + // For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip, + // kRectangleListAsTriangleStrip, whether the vertex index needs to be + // loaded from the index buffer (rather than using autogenerated indices), + // and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad + // because the same system constants may be used for the memexporting + // compute shader and the vertex shader for the same draw, but + // kSysFlag_VertexIndexLoad may be not needed. + kSysFlag_ComputeOrPrimitiveVertexIndexLoad = + 1u << kSysFlag_ComputeOrPrimitiveVertexIndexLoad_Shift, + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit = + 1u << kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit_Shift, kSysFlag_XYDividedByW = 1u << kSysFlag_XYDividedByW_Shift, kSysFlag_ZDividedByW = 1u << kSysFlag_ZDividedByW_Shift, kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift, @@ -116,11 +135,14 @@ class SpirvShaderTranslator : public ShaderTranslator { // IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED: // - SystemConstantIndex enum. // - Structure members in BeginTranslation. + // + // Using the std140 layout - vec2 must be aligned to 8 bytes, vec3 and vec4 to + // 16 bytes. struct SystemConstants { uint32_t flags; + uint32_t vertex_index_load_address; xenos::Endian vertex_index_endian; int32_t vertex_base_index; - uint32_t padding_vertex_base_index; float ndc_scale[3]; uint32_t padding_ndc_scale; @@ -216,6 +238,7 @@ class SpirvShaderTranslator : public ShaderTranslator { uint32_t max_storage_buffer_range; bool clip_distance; bool cull_distance; + bool full_draw_index_uint32; bool image_view_format_swizzle; bool signed_zero_inf_nan_preserve_float32; bool denorm_flush_to_zero_float32; @@ -576,8 +599,9 @@ class SpirvShaderTranslator : public ShaderTranslator { enum SystemConstantIndex : unsigned int { kSystemConstantFlags, - kSystemConstantIndexVertexIndexEndian, - kSystemConstantIndexVertexBaseIndex, + kSystemConstantVertexIndexLoadAddress, + kSystemConstantVertexIndexEndian, + kSystemConstantVertexBaseIndex, kSystemConstantNdcScale, kSystemConstantNdcOffset, kSystemConstantTextureSwizzledSigns, diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 4ad3642e0..3c4422561 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2383,9 +2383,10 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, normalized_depth_control); // Update system constants before uploading them. - UpdateSystemConstantValues(primitive_polygonal, - primitive_processing_result.host_index_endian, - viewport_info, used_texture_mask); + bool vertex_shader_index_load; + UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result, + viewport_info, used_texture_mask, + vertex_shader_index_load); // Update uniform buffers and descriptor sets after binding the pipeline with // the new layout. @@ -2451,7 +2452,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Draw. if (primitive_processing_result.index_buffer_type == - PrimitiveProcessor::ProcessedIndexBufferType::kNone) { + PrimitiveProcessor::ProcessedIndexBufferType::kNone || + vertex_shader_index_load) { deferred_command_buffer_.CmdVkDraw( primitive_processing_result.host_draw_vertex_count, 1, 0, 0); } else { @@ -3338,8 +3340,10 @@ void VulkanCommandProcessor::UpdateDynamicState( } void VulkanCommandProcessor::UpdateSystemConstantValues( - bool primitive_polygonal, xenos::Endian index_endian, - const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask) { + bool primitive_polygonal, + const PrimitiveProcessor::ProcessingResult& primitive_processing_result, + const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, + bool& vertex_shader_index_load_out) { #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES @@ -3362,6 +3366,52 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( // Flags. uint32_t flags = 0; + // Vertex index shader loading. + bool vertex_shader_index_load = false; + // Only for ProcessedIndexBufferType kGuest since kHostConverted indices may + // be not loaded into the GPU memory (only read on the CPU), though + // kHostConverted must never be used for point lists and rectangle lists + // without geometry shaders anyway. For regular 32-bit index fetching without + // fullDrawIndexUint32, kHostConverted indices are already byte-swapped and + // truncated to 24 bits, so indirect fetch is not needed. + if (primitive_processing_result.index_buffer_type == + PrimitiveProcessor::ProcessedIndexBufferType::kGuest) { + switch (primitive_processing_result.host_vertex_shader_type) { + case Shader::HostVertexShaderType::kVertex: { + // For guest (usually big-endian) 32-bit indices when they're not + // supported by the device. + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { + const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); + const VkPhysicalDeviceFeatures& device_features = + provider.device_features(); + if (!device_features.fullDrawIndexUint32) { + vertex_shader_index_load = true; + flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad; + } + } + } break; + // kMemexportCompute never comes out of the PrimitiveProcessor, as + // memexport compute shaders are executed alongside their vertex + // counterparts, since they may still result in drawing. + case Shader::HostVertexShaderType::kPointListAsTriangleStrip: + case Shader::HostVertexShaderType::kRectangleListAsTriangleStrip: { + // Always loading the guest index buffer indirectly if it's used, as + // host indexing contains a part needed specifically for the host for + // the construction of the primitive - host vertices don't map 1:1 to + // guest ones. + vertex_shader_index_load = true; + flags |= + SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad; + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { + flags |= SpirvShaderTranslator :: + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit; + } + } break; + default: + break; + } + } + vertex_shader_index_load_out = vertex_shader_index_load; // W0 division control. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf // 8: VTX_XY_FMT = true: the incoming XY have already been multiplied by 1/W0. @@ -3404,9 +3454,21 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.flags != flags; system_constants_.flags = flags; + // Index buffer address for loading in the shaders. + if (flags & + (SpirvShaderTranslator::kSysFlag_VertexIndexLoad | + SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad)) { + dirty |= system_constants_.vertex_index_load_address != + primitive_processing_result.guest_index_base; + system_constants_.vertex_index_load_address = + primitive_processing_result.guest_index_base; + } + // Index or tessellation edge factor buffer endianness. - dirty |= system_constants_.vertex_index_endian != index_endian; - system_constants_.vertex_index_endian = index_endian; + dirty |= system_constants_.vertex_index_endian != + primitive_processing_result.host_index_endian; + system_constants_.vertex_index_endian = + primitive_processing_result.host_index_endian; // Vertex index offset. dirty |= system_constants_.vertex_base_index != vgt_indx_offset; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 215fa3ee9..f500e0718 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -433,10 +433,11 @@ class VulkanCommandProcessor : public CommandProcessor { void UpdateDynamicState(const draw_util::ViewportInfo& viewport_info, bool primitive_polygonal, reg::RB_DEPTHCONTROL normalized_depth_control); - void UpdateSystemConstantValues(bool primitive_polygonal, - xenos::Endian index_endian, - const draw_util::ViewportInfo& viewport_info, - uint32_t used_texture_mask); + void UpdateSystemConstantValues( + bool primitive_polygonal, + const PrimitiveProcessor::ProcessingResult& primitive_processing_result, + const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, + bool& vertex_shader_index_load_out); bool UpdateBindings(const VulkanShader* vertex_shader, const VulkanShader* pixel_shader); // Allocates a descriptor set and fills one or two VkWriteDescriptorSet diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 644874ffd..55a16df1d 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -208,6 +208,7 @@ enum class Endian128 : uint32_t { enum class IndexFormat : uint32_t { kInt16, + // Not very common, but used for some world draws in 545407E0. kInt32, };