From 3d30b2eec3ab1f83140b09745bee881fb5d5dde2 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 25 May 2024 16:00:21 +0300 Subject: [PATCH] [Vulkan] Shader memory export (#145) --- src/xenia/gpu/spirv_builder.cc | 90 ++ src/xenia/gpu/spirv_builder.h | 47 + src/xenia/gpu/spirv_shader_translator.cc | 351 +++++-- src/xenia/gpu/spirv_shader_translator.h | 73 +- src/xenia/gpu/spirv_shader_translator_alu.cc | 51 +- .../gpu/spirv_shader_translator_memexport.cc | 950 ++++++++++++++++++ .../gpu/vulkan/vulkan_command_processor.cc | 64 +- .../gpu/vulkan/vulkan_command_processor.h | 3 + 8 files changed, 1535 insertions(+), 94 deletions(-) create mode 100644 src/xenia/gpu/spirv_shader_translator_memexport.cc diff --git a/src/xenia/gpu/spirv_builder.cc b/src/xenia/gpu/spirv_builder.cc index 2ed78bd65..fc2e92850 100644 --- a/src/xenia/gpu/spirv_builder.cc +++ b/src/xenia/gpu/spirv_builder.cc @@ -203,5 +203,95 @@ spv::Id SpirvBuilder::IfBuilder::createMergePhi(spv::Id then_variable, getElsePhiParent()); } +SpirvBuilder::SwitchBuilder::SwitchBuilder(spv::Id selector, + unsigned int selection_control, + SpirvBuilder& builder) + : builder_(builder), + selector_(selector), + selection_control_(selection_control), + function_(builder.getBuildPoint()->getParent()), + header_block_(builder.getBuildPoint()), + default_phi_parent_(builder.getBuildPoint()->getId()) { + merge_block_ = new spv::Block(builder_.getUniqueId(), function_); +} + +void SpirvBuilder::SwitchBuilder::makeBeginDefault() { + assert_null(default_block_); + + endSegment(); + + default_block_ = new spv::Block(builder_.getUniqueId(), function_); + function_.addBlock(default_block_); + default_block_->addPredecessor(header_block_); + builder_.setBuildPoint(default_block_); + + current_branch_ = Branch::kDefault; +} + +void SpirvBuilder::SwitchBuilder::makeBeginCase(unsigned int literal) { + endSegment(); + + auto case_block = new spv::Block(builder_.getUniqueId(), function_); + function_.addBlock(case_block); + cases_.emplace_back(literal, case_block->getId()); + case_block->addPredecessor(header_block_); + builder_.setBuildPoint(case_block); + + current_branch_ = Branch::kCase; +} + +void SpirvBuilder::SwitchBuilder::addCurrentCaseLiteral(unsigned int literal) { + assert_true(current_branch_ == Branch::kCase); + + cases_.emplace_back(literal, cases_.back().second); +} + +void SpirvBuilder::SwitchBuilder::makeEndSwitch() { + endSegment(); + + builder_.setBuildPoint(header_block_); + + builder_.createSelectionMerge(merge_block_, selection_control_); + + std::unique_ptr switch_instruction = + std::make_unique(spv::OpSwitch); + switch_instruction->addIdOperand(selector_); + if (default_block_) { + switch_instruction->addIdOperand(default_block_->getId()); + } else { + switch_instruction->addIdOperand(merge_block_->getId()); + merge_block_->addPredecessor(header_block_); + } + for (const std::pair& case_pair : cases_) { + switch_instruction->addImmediateOperand(case_pair.first); + switch_instruction->addIdOperand(case_pair.second); + } + builder_.getBuildPoint()->addInstruction(std::move(switch_instruction)); + + function_.addBlock(merge_block_); + builder_.setBuildPoint(merge_block_); + + current_branch_ = Branch::kMerge; +} + +void SpirvBuilder::SwitchBuilder::endSegment() { + assert_true(current_branch_ == Branch::kSelection || + current_branch_ == Branch::kDefault || + current_branch_ == Branch::kCase); + + if (current_branch_ == Branch::kSelection) { + return; + } + + if (!builder_.getBuildPoint()->isTerminated()) { + builder_.createBranch(merge_block_); + if (current_branch_ == Branch::kDefault) { + default_phi_parent_ = builder_.getBuildPoint()->getId(); + } + } + + current_branch_ = Branch::kSelection; +} + } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/spirv_builder.h b/src/xenia/gpu/spirv_builder.h index 1bb2e6851..7422d7c63 100644 --- a/src/xenia/gpu/spirv_builder.h +++ b/src/xenia/gpu/spirv_builder.h @@ -10,7 +10,10 @@ #ifndef XENIA_GPU_SPIRV_BUILDER_H_ #define XENIA_GPU_SPIRV_BUILDER_H_ +#include #include +#include +#include #include "third_party/glslang/SPIRV/SpvBuilder.h" #include "xenia/base/assert.h" @@ -99,6 +102,50 @@ class SpirvBuilder : public spv::Builder { Branch currentBranch = Branch::kThen; #endif }; + + // Simpler and more flexible (such as multiple cases pointing to the same + // block) compared to makeSwitch. + class SwitchBuilder { + public: + SwitchBuilder(spv::Id selector, unsigned int selection_control, + SpirvBuilder& builder); + ~SwitchBuilder() { assert_true(current_branch_ == Branch::kMerge); } + + void makeBeginDefault(); + void makeBeginCase(unsigned int literal); + void addCurrentCaseLiteral(unsigned int literal); + void makeEndSwitch(); + + // If there's no default block that branches to the merge block, the phi + // parent is the header block - this simplifies case-only usage. + spv::Id getDefaultPhiParent() const { return default_phi_parent_; } + + private: + enum class Branch { + kSelection, + kDefault, + kCase, + kMerge, + }; + + void endSegment(); + + SpirvBuilder& builder_; + spv::Id selector_; + unsigned int selection_control_; + + spv::Function& function_; + + spv::Block* header_block_; + spv::Block* merge_block_; + spv::Block* default_block_ = nullptr; + + std::vector> cases_; + + spv::Id default_phi_parent_; + + Branch current_branch_ = Branch::kSelection; + }; }; } // namespace gpu diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index e34193219..399b7079f 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -30,30 +30,35 @@ namespace gpu { SpirvShaderTranslator::Features::Features(bool all) : spirv_version(all ? spv::Spv_1_5 : spv::Spv_1_0), max_storage_buffer_range(all ? UINT32_MAX : (128 * 1024 * 1024)), + full_draw_index_uint32(all), + vertex_pipeline_stores_and_atomics(all), + fragment_stores_and_atomics(all), clip_distance(all), cull_distance(all), - demote_to_helper_invocation(all), - fragment_shader_sample_interlock(all), - full_draw_index_uint32(all), image_view_format_swizzle(all), signed_zero_inf_nan_preserve_float32(all), denorm_flush_to_zero_float32(all), - rounding_mode_rte_float32(all) {} + rounding_mode_rte_float32(all), + fragment_shader_sample_interlock(all), + demote_to_helper_invocation(all) {} SpirvShaderTranslator::Features::Features( const ui::vulkan::VulkanProvider::DeviceInfo& device_info) : max_storage_buffer_range(device_info.maxStorageBufferRange), + full_draw_index_uint32(device_info.fullDrawIndexUint32), + vertex_pipeline_stores_and_atomics( + device_info.vertexPipelineStoresAndAtomics), + fragment_stores_and_atomics(device_info.fragmentStoresAndAtomics), clip_distance(device_info.shaderClipDistance), cull_distance(device_info.shaderCullDistance), - demote_to_helper_invocation(device_info.shaderDemoteToHelperInvocation), - fragment_shader_sample_interlock( - device_info.fragmentShaderSampleInterlock), - full_draw_index_uint32(device_info.fullDrawIndexUint32), image_view_format_swizzle(device_info.imageViewFormatSwizzle), signed_zero_inf_nan_preserve_float32( device_info.shaderSignedZeroInfNanPreserveFloat32), denorm_flush_to_zero_float32(device_info.shaderDenormFlushToZeroFloat32), - rounding_mode_rte_float32(device_info.shaderRoundingModeRTEFloat32) { + rounding_mode_rte_float32(device_info.shaderRoundingModeRTEFloat32), + fragment_shader_sample_interlock( + device_info.fragmentShaderSampleInterlock), + demote_to_helper_invocation(device_info.shaderDemoteToHelperInvocation) { if (device_info.apiVersion >= VK_MAKE_API_VERSION(0, 1, 2, 0)) { spirv_version = spv::Spv_1_5; } else if (device_info.ext_1_2_VK_KHR_spirv_1_4) { @@ -117,6 +122,14 @@ void SpirvShaderTranslator::Reset() { main_interface_.clear(); var_main_registers_ = spv::NoResult; + var_main_memexport_address_ = spv::NoResult; + for (size_t memexport_eM_index = 0; + memexport_eM_index < xe::countof(var_main_memexport_data_); + ++memexport_eM_index) { + var_main_memexport_data_[memexport_eM_index] = spv::NoResult; + } + var_main_memexport_data_written_ = spv::NoResult; + main_memexport_allowed_ = spv::NoResult; var_main_point_size_edge_flag_kill_vertex_ = spv::NoResult; var_main_kill_pixel_ = spv::NoResult; var_main_fsi_color_written_ = spv::NoResult; @@ -310,6 +323,8 @@ void SpirvShaderTranslator::StartTranslation() { main_interface_.push_back(uniform_system_constants_); } + bool memexport_used = IsMemoryExportUsed(); + if (!is_depth_only_fragment_shader_) { // Common uniform buffer - float constants. uint32_t float_constant_count = @@ -420,9 +435,10 @@ void SpirvShaderTranslator::StartTranslation() { builder_->addMemberName(type_shared_memory, 0, "shared_memory"); builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationRestrict); - // TODO(Triang3l): Make writable when memexport is implemented. - builder_->addMemberDecoration(type_shared_memory, 0, - spv::DecorationNonWritable); + if (!memexport_used) { + builder_->addMemberDecoration(type_shared_memory, 0, + spv::DecorationNonWritable); + } builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset, 0); builder_->addDecoration(type_shared_memory, @@ -509,6 +525,24 @@ void SpirvShaderTranslator::StartTranslation() { builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction, type_register_array, "xe_var_registers"); } + if (memexport_used) { + var_main_memexport_address_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float4_, + "xe_var_memexport_address", const_float4_0_); + uint8_t memexport_eM_remaining = current_shader().memexport_eM_written(); + uint32_t memexport_eM_index; + while ( + xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) { + memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index); + var_main_memexport_data_[memexport_eM_index] = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float4_, + fmt::format("xe_var_memexport_data_{}", memexport_eM_index).c_str(), + const_float4_0_); + } + var_main_memexport_data_written_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_uint_, + "xe_var_memexport_data_written", const_uint_0_); + } } // Write the execution model-specific prologue with access to variables in the @@ -647,6 +681,10 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { builder_->setBuildPoint(main_loop_merge_); } + // Write data for the last memexport. + ExportToMemory( + current_shader().memexport_eM_potentially_written_before_end()); + if (is_vertex_shader()) { CompleteVertexOrTessEvalShaderInMain(); } else if (is_pixel_shader()) { @@ -1077,6 +1115,34 @@ void SpirvShaderTranslator::ProcessJumpInstruction( builder_->createBranch(main_loop_continue_); } +void SpirvShaderTranslator::ProcessAllocInstruction( + const ParsedAllocInstruction& instr, uint8_t export_eM) { + bool start_memexport = instr.type == ucode::AllocType::kMemory && + current_shader().memexport_eM_written(); + if (export_eM || start_memexport) { + CloseExecConditionals(); + } + + if (export_eM) { + ExportToMemory(export_eM); + // Reset which eM# elements have been written. + builder_->createStore(const_uint_0_, var_main_memexport_data_written_); + // Break dependencies from the previous memexport. + uint8_t export_eM_remaining = export_eM; + uint32_t eM_index; + while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) { + export_eM_remaining &= ~(uint8_t(1) << eM_index); + builder_->createStore(const_float4_0_, + var_main_memexport_data_[eM_index]); + } + } + + if (start_memexport) { + // Initialize eA to an invalid address. + builder_->createStore(const_float4_0_, var_main_memexport_address_); + } +} + spv::Id SpirvShaderTranslator::SpirvSmearScalarResultOrConstant( spv::Id scalar, spv::Id vector_type) { bool is_constant = builder_->isConstant(scalar); @@ -1205,6 +1271,8 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() { } void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { + Modification shader_modification = GetSpirvShaderModification(); + // The edge flag isn't used for any purpose by the translator. if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b101) { id_vector_temp_.clear(); @@ -1244,11 +1312,40 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { } } - Modification shader_modification = GetSpirvShaderModification(); - // TODO(Triang3l): For HostVertexShaderType::kRectangeListAsTriangleStrip, // start the vertex loop, and load the index there. + // Check if memory export should be allowed for this host vertex of the guest + // primitive to make sure export is done only once for each guest vertex. + if (IsMemoryExportUsed()) { + spv::Id memexport_allowed_for_host_vertex_of_guest_primitive = + spv::NoResult; + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Only for one host vertex for the point. + memexport_allowed_for_host_vertex_of_guest_primitive = + builder_->createBinOp( + spv::OpIEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createLoad(input_vertex_index_, + spv::NoPrecision)), + builder_->makeUintConstant(3)), + const_uint_0_); + } + + if (memexport_allowed_for_host_vertex_of_guest_primitive != spv::NoResult) { + main_memexport_allowed_ = + main_memexport_allowed_ != spv::NoResult + ? builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, main_memexport_allowed_, + memexport_allowed_for_host_vertex_of_guest_primitive) + : memexport_allowed_for_host_vertex_of_guest_primitive; + } + } + // Load the vertex index or the tessellation parameters. if (register_count()) { // TODO(Triang3l): Barycentric coordinates and patch index. @@ -1827,6 +1924,13 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { } void SpirvShaderTranslator::StartFragmentShaderInMain() { + // TODO(Triang3l): Allow memory export with resolution scaling only for the + // center host pixel, with sample shading (for depth format conversion) only + // for the bottom-right sample (unlike in Direct3D, the sample mask input + // doesn't include covered samples of the primitive that correspond to other + // invocations, so use the sample that's the most friendly to the half-pixel + // offset). + // Set up pixel killing from within the translated shader without affecting // the control flow (unlike with OpKill), similarly to how pixel killing works // on the Xenos, and also keeping a single critical section exit and return @@ -2460,6 +2564,26 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result, var_main_fsi_color_written_); } } break; + case InstructionStorageTarget::kExportAddress: { + // spv::NoResult if memory export usage is unsupported or invalid. + target_pointer = var_main_memexport_address_; + } break; + case InstructionStorageTarget::kExportData: { + // spv::NoResult if memory export usage is unsupported or invalid. + target_pointer = var_main_memexport_data_[result.storage_index]; + if (target_pointer != spv::NoResult) { + // Mark that the eM# has been written to and needs to be exported. + assert_true(var_main_memexport_data_written_ != spv::NoResult); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createLoad(var_main_memexport_data_written_, + spv::NoPrecision), + builder_->makeUintConstant(uint32_t(1) + << result.storage_index)), + var_main_memexport_data_written_); + } + } break; default: // TODO(Triang3l): All storage targets. break; @@ -2814,16 +2938,59 @@ spv::Id SpirvShaderTranslator::EndianSwap32Uint(spv::Id value, spv::Id endian) { return value; } +spv::Id SpirvShaderTranslator::EndianSwap128Uint4(spv::Id value, + spv::Id endian) { + // Change 8-in-64 and 8-in-128 to 8-in-32, and then swap within 32 bits. + + spv::Id is_8in64 = builder_->createBinOp( + spv::OpIEqual, type_bool_, endian, + builder_->makeUintConstant( + static_cast(xenos::Endian128::k8in64))); + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(3); + uint_vector_temp_.push_back(2); + value = builder_->createTriOp( + spv::OpSelect, type_uint4_, is_8in64, + builder_->createRvalueSwizzle(spv::NoPrecision, type_uint4_, value, + uint_vector_temp_), + value); + + spv::Id is_8in128 = builder_->createBinOp( + spv::OpIEqual, type_bool_, endian, + builder_->makeUintConstant( + static_cast(xenos::Endian128::k8in128))); + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(3); + uint_vector_temp_.push_back(2); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(0); + value = builder_->createTriOp( + spv::OpSelect, type_uint4_, is_8in128, + builder_->createRvalueSwizzle(spv::NoPrecision, type_uint4_, value, + uint_vector_temp_), + value); + + endian = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createBinOp(spv::OpLogicalOr, type_bool_, is_8in64, is_8in128), + builder_->makeUintConstant( + static_cast(xenos::Endian128::k8in32)), + endian); + + return EndianSwap32Uint(value, endian); +} + spv::Id SpirvShaderTranslator::LoadUint32FromSharedMemory( spv::Id address_dwords_int) { - spv::Block& head_block = *builder_->getBuildPoint(); - assert_false(head_block.isTerminated()); - spv::StorageClass storage_class = features_.spirv_version >= spv::Spv_1_3 ? spv::StorageClassStorageBuffer : spv::StorageClassUniform; - uint32_t buffer_count_log2 = GetSharedMemoryStorageBufferCountLog2(); - if (!buffer_count_log2) { + + uint32_t binding_count_log2 = GetSharedMemoryStorageBufferCountLog2(); + + if (!binding_count_log2) { // Single binding - load directly. id_vector_temp_.clear(); // The only SSBO struct member. @@ -2837,8 +3004,10 @@ spv::Id SpirvShaderTranslator::LoadUint32FromSharedMemory( // The memory is split into multiple bindings - check which binding to load // from. 29 is log2(512 MB), but addressing in dwords (4 B). Not indexing the - // array with the variable itself because it needs VK_EXT_descriptor_indexing. - uint32_t binding_address_bits = (29 - 2) - buffer_count_log2; + // array with the variable itself because it needs non-uniform storage buffer + // indexing. + + uint32_t binding_address_bits = (29 - 2) - binding_count_log2; spv::Id binding_index = builder_->createBinOp( spv::OpShiftRightLogical, type_uint_, builder_->createUnaryOp(spv::OpBitcast, type_uint_, address_dwords_int), @@ -2847,51 +3016,119 @@ spv::Id SpirvShaderTranslator::LoadUint32FromSharedMemory( spv::OpBitwiseAnd, type_int_, address_dwords_int, builder_->makeIntConstant( int((uint32_t(1) << binding_address_bits) - 1))); - uint32_t buffer_count = 1 << buffer_count_log2; - spv::Block* switch_case_blocks[512 / 128]; - for (uint32_t i = 0; i < buffer_count; ++i) { - switch_case_blocks[i] = &builder_->makeNewBlock(); - } - spv::Block& switch_merge_block = builder_->makeNewBlock(); - spv::Id value_phi_result = builder_->getUniqueId(); - std::unique_ptr value_phi_op = - std::make_unique(value_phi_result, type_uint_, - spv::OpPhi); - builder_->createSelectionMerge(&switch_merge_block, - spv::SelectionControlDontFlattenMask); - { - std::unique_ptr switch_op = - std::make_unique(spv::OpSwitch); - switch_op->addIdOperand(binding_index); - // Highest binding index is the default case. - switch_op->addIdOperand(switch_case_blocks[buffer_count - 1]->getId()); - switch_case_blocks[buffer_count - 1]->addPredecessor(&head_block); - for (uint32_t i = 0; i < buffer_count - 1; ++i) { - switch_op->addImmediateOperand(int(i)); - switch_op->addIdOperand(switch_case_blocks[i]->getId()); - switch_case_blocks[i]->addPredecessor(&head_block); - } - builder_->getBuildPoint()->addInstruction(std::move(switch_op)); - } - for (uint32_t i = 0; i < buffer_count; ++i) { - builder_->setBuildPoint(switch_case_blocks[i]); - id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeIntConstant(int(i))); - // The only SSBO struct member. - id_vector_temp_.push_back(const_int_0_); - id_vector_temp_.push_back(binding_address); + + auto value_phi_op = std::make_unique( + builder_->getUniqueId(), type_uint_, spv::OpPhi); + // Zero if out of bounds. + value_phi_op->addIdOperand(const_uint_0_); + value_phi_op->addIdOperand(builder_->getBuildPoint()->getId()); + + SpirvBuilder::SwitchBuilder binding_switch( + binding_index, spv::SelectionControlDontFlattenMask, *builder_); + uint32_t binding_count = uint32_t(1) << binding_count_log2; + + id_vector_temp_.clear(); + id_vector_temp_.push_back(spv::NoResult); + // The only SSBO struct member. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(binding_address); + + for (uint32_t i = 0; i < binding_count; ++i) { + binding_switch.makeBeginCase(i); + id_vector_temp_[0] = builder_->makeIntConstant(int(i)); value_phi_op->addIdOperand(builder_->createLoad( builder_->createAccessChain(storage_class, buffers_shared_memory_, id_vector_temp_), spv::NoPrecision)); - value_phi_op->addIdOperand(switch_case_blocks[i]->getId()); - builder_->createBranch(&switch_merge_block); + value_phi_op->addIdOperand(builder_->getBuildPoint()->getId()); } - builder_->setBuildPoint(&switch_merge_block); + + binding_switch.makeEndSwitch(); + + spv::Id value_phi_result = value_phi_op->getResultId(); builder_->getBuildPoint()->addInstruction(std::move(value_phi_op)); return value_phi_result; } +void SpirvShaderTranslator::StoreUint32ToSharedMemory( + spv::Id value, spv::Id address_dwords_int, spv::Id replace_mask) { + spv::StorageClass storage_class = features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform; + + spv::Id keep_mask = spv::NoResult; + if (replace_mask != spv::NoResult) { + keep_mask = builder_->createUnaryOp(spv::OpNot, type_uint_, replace_mask); + value = builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, value, + replace_mask); + } + + auto store = [&](spv::Id pointer) { + if (replace_mask != spv::NoResult) { + // Don't touch the other bits in the buffer, just modify the needed bits + // in the most up to date uint32 at the address. + spv::Id const_scope_device = builder_->makeUintConstant( + static_cast(spv::ScopeDevice)); + spv::Id const_semantics_relaxed = const_uint_0_; + builder_->createQuadOp(spv::OpAtomicAnd, type_uint_, pointer, + const_scope_device, const_semantics_relaxed, + keep_mask); + builder_->createQuadOp(spv::OpAtomicOr, type_uint_, pointer, + const_scope_device, const_semantics_relaxed, + value); + } else { + builder_->createStore(value, pointer); + } + }; + + uint32_t binding_count_log2 = GetSharedMemoryStorageBufferCountLog2(); + + if (!binding_count_log2) { + // Single binding - store directly. + id_vector_temp_.clear(); + // The only SSBO struct member. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(address_dwords_int); + store(builder_->createAccessChain(storage_class, buffers_shared_memory_, + id_vector_temp_)); + return; + } + + // The memory is split into multiple bindings - check which binding to store + // to. 29 is log2(512 MB), but addressing in dwords (4 B). Not indexing the + // array with the variable itself because it needs non-uniform storage buffer + // indexing. + + uint32_t binding_address_bits = (29 - 2) - binding_count_log2; + spv::Id binding_index = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createUnaryOp(spv::OpBitcast, type_uint_, address_dwords_int), + builder_->makeUintConstant(binding_address_bits)); + spv::Id binding_address = builder_->createBinOp( + spv::OpBitwiseAnd, type_int_, address_dwords_int, + builder_->makeIntConstant( + int((uint32_t(1) << binding_address_bits) - 1))); + + SpirvBuilder::SwitchBuilder binding_switch( + binding_index, spv::SelectionControlDontFlattenMask, *builder_); + uint32_t binding_count = uint32_t(1) << binding_count_log2; + + id_vector_temp_.clear(); + id_vector_temp_.push_back(spv::NoResult); + // The only SSBO struct member. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(binding_address); + + for (uint32_t i = 0; i < binding_count; ++i) { + binding_switch.makeBeginCase(i); + id_vector_temp_[0] = builder_->makeIntConstant(int(i)); + store(builder_->createAccessChain(storage_class, buffers_shared_memory_, + id_vector_temp_)); + } + + binding_switch.makeEndSwitch(); +} + spv::Id SpirvShaderTranslator::PWLGammaToLinear(spv::Id gamma, bool gamma_pre_saturated) { spv::Id value_type = builder_->getTypeId(gamma); diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 8c4942156..aefb00bf6 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -323,17 +323,28 @@ class SpirvShaderTranslator : public ShaderTranslator { explicit Features( const ui::vulkan::VulkanProvider::DeviceInfo& device_info); explicit Features(bool all = false); + unsigned int spirv_version; + uint32_t max_storage_buffer_range; + + bool full_draw_index_uint32; + + bool vertex_pipeline_stores_and_atomics; + bool fragment_stores_and_atomics; + bool clip_distance; bool cull_distance; - bool demote_to_helper_invocation; - bool fragment_shader_sample_interlock; - bool full_draw_index_uint32; + bool image_view_format_swizzle; + bool signed_zero_inf_nan_preserve_float32; bool denorm_flush_to_zero_float32; bool rounding_mode_rte_float32; + + bool fragment_shader_sample_interlock; + + bool demote_to_helper_invocation; }; SpirvShaderTranslator(const Features& features, @@ -424,6 +435,8 @@ class SpirvShaderTranslator : public ShaderTranslator { void ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) override; void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override; + void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) override; void ProcessVertexFetchInstruction( const ParsedVertexFetchInstruction& instr) override; @@ -470,6 +483,11 @@ class SpirvShaderTranslator : public ShaderTranslator { Shader::IsHostVertexShaderTypeDomain( GetSpirvShaderModification().vertex.host_vertex_shader_type); } + bool IsSpirvComputeShader() const { + return is_vertex_shader() && + GetSpirvShaderModification().vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kMemExportCompute; + } bool IsExecutionModeEarlyFragmentTests() const { return is_pixel_shader() && @@ -567,24 +585,48 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id ZeroIfAnyOperandIsZero(spv::Id value, spv::Id operand_0_abs, spv::Id operand_1_abs); // Conditionally discard the current fragment. Changes the build point. - void KillPixel(spv::Id condition); + void KillPixel(spv::Id condition, + uint8_t memexport_eM_potentially_written_before); // Return type is a xe::bit_count(result.GetUsedResultComponents())-component // float vector or a single float, depending on whether it's a reduction // instruction (check getTypeId of the result), or returns spv::NoResult if // nothing to store. - spv::Id ProcessVectorAluOperation(const ParsedAluInstruction& instr, - bool& predicate_written); + spv::Id ProcessVectorAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written); // Returns a float value to write to the previous scalar register and to the // destination. If the return value is ps itself (in the retain_prev case), // returns spv::NoResult (handled as a special case, so if it's retain_prev, // but don't need to write to anywhere, no OpLoad(ps) will be done). - spv::Id ProcessScalarAluOperation(const ParsedAluInstruction& instr, - bool& predicate_written); + spv::Id ProcessScalarAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written); // Perform endian swap of a uint scalar or vector. spv::Id EndianSwap32Uint(spv::Id value, spv::Id endian); + // Perform endian swap of a uint4 vector. + spv::Id EndianSwap128Uint4(spv::Id value, spv::Id endian); spv::Id LoadUint32FromSharedMemory(spv::Id address_dwords_int); + // If `replace_mask` is provided, the bits specified in the mask will be + // replaced with those from the value via OpAtomicAnd/Or. + // Bits of `value` not in `replace_mask` will be ignored. + void StoreUint32ToSharedMemory(spv::Id value, spv::Id address_dwords_int, + spv::Id replace_mask = spv::NoResult); + + bool IsMemoryExportSupported() const { + if (is_pixel_shader()) { + return features_.fragment_stores_and_atomics; + } + return features_.vertex_pipeline_stores_and_atomics || + IsSpirvComputeShader(); + } + + bool IsMemoryExportUsed() const { + return current_shader().memexport_eM_written() && IsMemoryExportSupported(); + } + + void ExportToMemory(uint8_t export_eM); // The source may be a floating-point scalar or a vector. spv::Id PWLGammaToLinear(spv::Id gamma, bool gamma_pre_saturated); @@ -872,6 +914,21 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id var_main_tfetch_gradients_v_; // float4[register_count()]. spv::Id var_main_registers_; + // Memory export variables are created only when needed. + // float4. + spv::Id var_main_memexport_address_; + // Each is float4. + spv::Id var_main_memexport_data_[ucode::kMaxMemExportElementCount]; + // Bit field of which eM# elements have been written so far by the invocation + // since the last memory write - uint. + spv::Id var_main_memexport_data_written_; + // If memory export is disabled in certain invocations or (if emulating some + // primitive types without a geometry shader) at specific guest vertex loop + // iterations because the translated shader is executed multiple times for the + // same guest vertex or pixel, this contains whether memory export is allowed + // in the current execution of the translated code. + // bool. + spv::Id main_memexport_allowed_; // VS only - float3 (special exports). spv::Id var_main_point_size_edge_flag_kill_vertex_; // PS, only when needed - bool. diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc index ecc88f57b..1e7580e34 100644 --- a/src/xenia/gpu/spirv_shader_translator_alu.cc +++ b/src/xenia/gpu/spirv_shader_translator_alu.cc @@ -39,10 +39,14 @@ spv::Id SpirvShaderTranslator::ZeroIfAnyOperandIsZero(spv::Id value, const_float_vectors_0_[num_components - 1], value); } -void SpirvShaderTranslator::KillPixel(spv::Id condition) { +void SpirvShaderTranslator::KillPixel( + spv::Id condition, uint8_t memexport_eM_potentially_written_before) { SpirvBuilder::IfBuilder kill_if(condition, spv::SelectionControlMaskNone, *builder_); { + // Perform outstanding memory exports before the invocation becomes inactive + // and storage writes are disabled. + ExportToMemory(memexport_eM_potentially_written_before); if (var_main_kill_pixel_ != spv::NoResult) { builder_->createStore(builder_->makeBoolConstant(true), var_main_kill_pixel_); @@ -77,12 +81,12 @@ void SpirvShaderTranslator::ProcessAluInstruction( // Whether the instruction has changed the predicate, and it needs to be // checked again later. bool predicate_written_vector = false; - spv::Id vector_result = - ProcessVectorAluOperation(instr, predicate_written_vector); + spv::Id vector_result = ProcessVectorAluOperation( + instr, memexport_eM_potentially_written_before, predicate_written_vector); bool predicate_written_scalar = false; - spv::Id scalar_result = - ProcessScalarAluOperation(instr, predicate_written_scalar); + spv::Id scalar_result = ProcessScalarAluOperation( + instr, memexport_eM_potentially_written_before, predicate_written_scalar); if (scalar_result != spv::NoResult) { EnsureBuildPointAvailable(); builder_->createStore(scalar_result, var_main_previous_scalar_); @@ -106,7 +110,8 @@ void SpirvShaderTranslator::ProcessAluInstruction( } spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( - const ParsedAluInstruction& instr, bool& predicate_written) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written) { predicate_written = false; uint32_t used_result_components = @@ -769,14 +774,16 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( case ucode::AluVectorOpcode::kKillGt: case ucode::AluVectorOpcode::kKillGe: case ucode::AluVectorOpcode::kKillNe: { - KillPixel(builder_->createUnaryOp( - spv::OpAny, type_bool_, - builder_->createBinOp( - spv::Op(kOps[size_t(instr.vector_opcode)]), type_bool4_, - GetOperandComponents(operand_storage[0], instr.vector_operands[0], - 0b1111), - GetOperandComponents(operand_storage[1], instr.vector_operands[1], - 0b1111)))); + KillPixel( + builder_->createUnaryOp( + spv::OpAny, type_bool_, + builder_->createBinOp( + spv::Op(kOps[size_t(instr.vector_opcode)]), type_bool4_, + GetOperandComponents(operand_storage[0], + instr.vector_operands[0], 0b1111), + GetOperandComponents(operand_storage[1], + instr.vector_operands[1], 0b1111))), + memexport_eM_potentially_written_before); return const_float_0_; } @@ -862,7 +869,8 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( } spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( - const ParsedAluInstruction& instr, bool& predicate_written) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written) { predicate_written = false; spv::Id operand_storage[2] = {}; @@ -1257,12 +1265,13 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( case ucode::AluScalarOpcode::kKillsNe: case ucode::AluScalarOpcode::kKillsOne: { KillPixel(builder_->createBinOp( - spv::Op(kOps[size_t(instr.scalar_opcode)]), type_bool_, - GetOperandComponents(operand_storage[0], instr.scalar_operands[0], - 0b0001), - instr.scalar_opcode == ucode::AluScalarOpcode::kKillsOne - ? const_float_1_ - : const_float_0_)); + spv::Op(kOps[size_t(instr.scalar_opcode)]), type_bool_, + GetOperandComponents(operand_storage[0], + instr.scalar_operands[0], 0b0001), + instr.scalar_opcode == ucode::AluScalarOpcode::kKillsOne + ? const_float_1_ + : const_float_0_), + memexport_eM_potentially_written_before); return const_float_0_; } diff --git a/src/xenia/gpu/spirv_shader_translator_memexport.cc b/src/xenia/gpu/spirv_shader_translator_memexport.cc new file mode 100644 index 000000000..94c0adf54 --- /dev/null +++ b/src/xenia/gpu/spirv_shader_translator_memexport.cc @@ -0,0 +1,950 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv_shader_translator.h" + +#include +#include +#include +#include +#include +#include + +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "xenia/base/assert.h" +#include "xenia/base/math.h" +#include "xenia/gpu/ucode.h" + +namespace xe { +namespace gpu { + +void SpirvShaderTranslator::ExportToMemory(uint8_t export_eM) { + if (!export_eM) { + return; + } + + assert_zero(export_eM & ~current_shader().memexport_eM_written()); + + if (!IsMemoryExportSupported()) { + return; + } + + // Check if memory export is allowed in this guest shader invocation. + std::optional if_memexport_allowed; + if (main_memexport_allowed_ != spv::NoResult) { + if_memexport_allowed.emplace(main_memexport_allowed_, + spv::SelectionControlDontFlattenMask, + *builder_); + } + + // If the pixel was killed (but the actual killing on the SPIR-V side has not + // been performed yet because the device doesn't support demotion to helper + // invocation that doesn't interfere with control flow), the current + // invocation is not considered active anymore. + std::optional if_pixel_not_killed; + if (var_main_kill_pixel_ != spv::NoResult) { + if_pixel_not_killed.emplace( + builder_->createUnaryOp( + spv::OpLogicalNot, type_bool_, + builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision)), + spv::SelectionControlDontFlattenMask, *builder_); + } + + // Check if the address with the correct sign and exponent was written, and + // that the index doesn't overflow the mantissa bits. + // all((eA_vector >> uvec4(30, 23, 23, 23)) == uvec4(0x1, 0x96, 0x96, 0x96)) + spv::Id eA_vector = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, + builder_->createLoad(var_main_memexport_address_, spv::NoPrecision)); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeUintConstant(30)); + id_vector_temp_.push_back(builder_->makeUintConstant(23)); + id_vector_temp_.push_back(id_vector_temp_.back()); + id_vector_temp_.push_back(id_vector_temp_.back()); + spv::Id address_validation_shift = + builder_->makeCompositeConstant(type_uint4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeUintConstant(0x1)); + id_vector_temp_.push_back(builder_->makeUintConstant(0x96)); + id_vector_temp_.push_back(id_vector_temp_.back()); + id_vector_temp_.push_back(id_vector_temp_.back()); + spv::Id address_validation_value = + builder_->makeCompositeConstant(type_uint4_, id_vector_temp_); + SpirvBuilder::IfBuilder if_address_valid( + builder_->createUnaryOp( + spv::OpAll, type_bool_, + builder_->createBinOp( + spv::OpIEqual, type_bool4_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint4_, + eA_vector, address_validation_shift), + address_validation_value)), + spv::SelectionControlDontFlattenMask, *builder_, 2, 1); + + using EMIdArray = std::array; + + auto for_each_eM = [&](std::function fn) { + uint8_t eM_remaining = export_eM; + uint32_t eM_index; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + fn(eM_index); + } + }; + + // Load the original eM. + EMIdArray eM_original; + for_each_eM([&](uint32_t eM_index) { + eM_original[eM_index] = builder_->createLoad( + var_main_memexport_data_[eM_index], spv::NoPrecision); + }); + + // Swap red and blue if needed. + spv::Id format_info = + builder_->createCompositeExtract(eA_vector, type_uint_, 2); + spv::Id swap_red_blue = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, format_info, + builder_->makeUintConstant(uint32_t(1) << 19)), + const_uint_0_); + EMIdArray eM_swapped; + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(2); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(3); + for_each_eM([&](uint32_t eM_index) { + eM_swapped[eM_index] = builder_->createTriOp( + spv::OpSelect, type_float4_, swap_red_blue, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float4_, + eM_original[eM_index], uint_vector_temp_), + eM_original[eM_index]); + }); + + // Extract the numeric format. + spv::Id is_signed = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, format_info, + builder_->makeUintConstant(uint32_t(1) << 16)), + const_uint_0_); + spv::Id is_norm = builder_->createBinOp( + spv::OpIEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, format_info, + builder_->makeUintConstant(uint32_t(1) << 17)), + const_uint_0_); + + // Perform format packing. + + auto flush_nan = [&](const EMIdArray& eM) -> EMIdArray { + EMIdArray eM_flushed; + for_each_eM([&](uint32_t eM_index) { + spv::Id element_unflushed = eM[eM_index]; + unsigned int component_count = + builder_->getNumComponents(element_unflushed); + eM_flushed[eM_index] = builder_->createTriOp( + spv::OpSelect, type_float_vectors_[component_count - 1], + builder_->createUnaryOp(spv::OpIsNan, + type_bool_vectors_[component_count - 1], + element_unflushed), + const_float_vectors_0_[component_count - 1], element_unflushed); + }); + return eM_flushed; + }; + + auto make_float_constant_vectors = + [&](float value) -> std::array { + std::array const_vectors; + const_vectors[0] = builder_->makeFloatConstant(value); + id_vector_temp_.clear(); + id_vector_temp_.push_back(const_vectors[0]); + for (unsigned int component_count_minus_1 = 1; component_count_minus_1 < 4; + ++component_count_minus_1) { + id_vector_temp_.push_back(const_vectors[0]); + const_vectors[component_count_minus_1] = builder_->makeCompositeConstant( + type_float_vectors_[component_count_minus_1], id_vector_temp_); + } + return const_vectors; + }; + std::array const_float_vectors_minus_1 = + make_float_constant_vectors(-1.0f); + std::array const_float_vectors_minus_0_5 = + make_float_constant_vectors(-0.5f); + std::array const_float_vectors_0_5 = + make_float_constant_vectors(0.5f); + + // The widths must be without holes (R, RG, RGB, RGBA), and expecting the + // widths to add up to the size of the stored texel (8, 16 or 32 bits), as the + // unused upper bits will contain junk from the sign extension of X if the + // number is signed. + auto pack_8_16_32 = [&](std::array widths) -> EMIdArray { + unsigned int component_count; + std::array offsets{}; + for (component_count = 0; component_count < widths.size(); + ++component_count) { + if (!widths[component_count]) { + break; + } + // Only formats for which max + 0.5 can be represented exactly. + assert(widths[component_count] <= 23); + if (component_count) { + offsets[component_count] = + offsets[component_count - 1] + widths[component_count - 1]; + } + } + assert_not_zero(component_count); + + // Extract the needed components. + EMIdArray eM_unflushed = eM_swapped; + if (component_count < 4) { + if (component_count == 1) { + for_each_eM([&](uint32_t eM_index) { + eM_unflushed[eM_index] = builder_->createCompositeExtract( + eM_unflushed[eM_index], type_float_, 0); + }); + } else { + uint_vector_temp_.clear(); + for (unsigned int component_index = 0; + component_index < component_count; ++component_index) { + uint_vector_temp_.push_back(component_index); + } + for_each_eM([&](uint32_t eM_index) { + eM_unflushed[eM_index] = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float_vectors_[component_count - 1], + eM_unflushed[eM_index], uint_vector_temp_); + }); + } + } + + // Flush NaNs. + EMIdArray eM_flushed = flush_nan(eM_unflushed); + + // Convert to integers. + SpirvBuilder::IfBuilder if_signed( + is_signed, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray eM_signed; + { + // Signed. + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray eM_norm; + { + // Signed normalized. + id_vector_temp_.clear(); + for (unsigned int component_index = 0; + component_index < component_count; ++component_index) { + id_vector_temp_.push_back(builder_->makeFloatConstant( + float((uint32_t(1) << (widths[component_index] - 1)) - 1))); + } + spv::Id const_max_value = + component_count > 1 + ? builder_->makeCompositeConstant( + type_float_vectors_[component_count - 1], id_vector_temp_) + : id_vector_temp_.front(); + for_each_eM([&](uint32_t eM_index) { + eM_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float_vectors_[component_count - 1], + builder_->createTriBuiltinCall( + type_float_vectors_[component_count - 1], + ext_inst_glsl_std_450_, GLSLstd450FClamp, + eM_flushed[eM_index], + const_float_vectors_minus_1[component_count - 1], + const_float_vectors_1_[component_count - 1]), + const_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + eM_signed[eM_index] = + if_norm.createMergePhi(eM_norm[eM_index], eM_flushed[eM_index]); + }); + // Convert to signed integer, adding plus/minus 0.5 before truncating + // according to the Direct3D format conversion rules. + for_each_eM([&](uint32_t eM_index) { + eM_signed[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint_vectors_[component_count - 1], + builder_->createUnaryOp( + spv::OpConvertFToS, type_int_vectors_[component_count - 1], + builder_->createNoContractionBinOp( + spv::OpFAdd, type_float_vectors_[component_count - 1], + eM_signed[eM_index], + builder_->createTriOp( + spv::OpSelect, type_float_vectors_[component_count - 1], + builder_->createBinOp( + spv::OpFOrdLessThan, + type_bool_vectors_[component_count - 1], + eM_signed[eM_index], + const_float_vectors_0_[component_count - 1]), + const_float_vectors_minus_0_5[component_count - 1], + const_float_vectors_0_5[component_count - 1])))); + }); + } + if_signed.makeBeginElse(); + EMIdArray eM_unsigned; + { + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray eM_norm; + { + // Unsigned normalized. + id_vector_temp_.clear(); + for (unsigned int component_index = 0; + component_index < component_count; ++component_index) { + id_vector_temp_.push_back(builder_->makeFloatConstant( + float((uint32_t(1) << widths[component_index]) - 1))); + } + spv::Id const_max_value = + component_count > 1 + ? builder_->makeCompositeConstant( + type_float_vectors_[component_count - 1], id_vector_temp_) + : id_vector_temp_.front(); + for_each_eM([&](uint32_t eM_index) { + eM_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float_vectors_[component_count - 1], + builder_->createTriBuiltinCall( + type_float_vectors_[component_count - 1], + ext_inst_glsl_std_450_, GLSLstd450FClamp, + eM_flushed[eM_index], + const_float_vectors_0_[component_count - 1], + const_float_vectors_1_[component_count - 1]), + const_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + eM_unsigned[eM_index] = + if_norm.createMergePhi(eM_norm[eM_index], eM_flushed[eM_index]); + }); + // Convert to unsigned integer, adding 0.5 before truncating according to + // the Direct3D format conversion rules. + for_each_eM([&](uint32_t eM_index) { + eM_unsigned[eM_index] = builder_->createUnaryOp( + spv::OpConvertFToU, type_uint_vectors_[component_count - 1], + builder_->createNoContractionBinOp( + spv::OpFAdd, type_float_vectors_[component_count - 1], + eM_unsigned[eM_index], + const_float_vectors_0_5[component_count - 1])); + }); + } + if_signed.makeEndIf(); + EMIdArray eM_unpacked; + for_each_eM([&](uint32_t eM_index) { + eM_unpacked[eM_index] = + if_signed.createMergePhi(eM_signed[eM_index], eM_unsigned[eM_index]); + }); + + // Pack into a 32-bit value, and pad to a 4-component vector for the phi. + EMIdArray eM_packed; + for_each_eM([&](uint32_t eM_index) { + spv::Id element_unpacked = eM_unpacked[eM_index]; + eM_packed[eM_index] = component_count > 1 + ? builder_->createCompositeExtract( + element_unpacked, type_uint_, 0) + : element_unpacked; + for (unsigned int component_index = 1; component_index < component_count; + ++component_index) { + eM_packed[eM_index] = builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, eM_packed[eM_index], + builder_->createCompositeExtract(element_unpacked, type_uint_, + component_index), + builder_->makeUintConstant(offsets[component_index]), + builder_->makeUintConstant(widths[component_index])); + } + id_vector_temp_.clear(); + id_vector_temp_.resize(4, const_uint_0_); + id_vector_temp_.front() = eM_packed[eM_index]; + eM_packed[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + + return eM_packed; + }; + + SpirvBuilder::SwitchBuilder format_switch( + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, format_info, + builder_->makeUintConstant(8), + builder_->makeUintConstant(6)), + spv::SelectionControlDontFlattenMask, *builder_); + + struct FormatCase { + EMIdArray eM_packed; + uint32_t element_bytes_log2; + spv::Id phi_parent; + }; + std::vector format_cases; + // Must be called at the end of the switch case segment for the correct phi + // parent. + auto add_format_case = [&](const EMIdArray& eM_packed, + uint32_t element_bytes_log2) { + FormatCase& format_case = format_cases.emplace_back(); + format_case.eM_packed = eM_packed; + format_case.element_bytes_log2 = element_bytes_log2; + format_case.phi_parent = builder_->getBuildPoint()->getId(); + }; + + // k_8, k_8_A, k_8_B + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_8)); + // TODO(Triang3l): Investigate how input should be treated for k_8_A, k_8_B. + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_A)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_B)); + add_format_case(pack_8_16_32({8}), 0); + + // k_1_5_5_5 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_1_5_5_5)); + add_format_case(pack_8_16_32({5, 5, 5, 1}), 1); + + // k_5_6_5 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_5_6_5)); + add_format_case(pack_8_16_32({5, 6, 5}), 1); + + // k_6_5_5 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_6_5_5)); + add_format_case(pack_8_16_32({5, 5, 6}), 1); + + // k_8_8_8_8, k_8_8_8_8_A, k_8_8_8_8_AS_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_8_8_8_8)); + // TODO(Triang3l): Investigate how input should be treated for k_8_8_8_8_A. + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_8_8_8_A)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16)); + add_format_case(pack_8_16_32({8, 8, 8, 8}), 2); + + // k_2_10_10_10, k_2_10_10_10_AS_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_2_10_10_10)); + format_switch.addCurrentCaseLiteral(static_cast( + xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16)); + add_format_case(pack_8_16_32({10, 10, 10, 2}), 2); + + // k_8_8 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_8_8)); + add_format_case(pack_8_16_32({8, 8}), 1); + + // k_4_4_4_4 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_4_4_4_4)); + add_format_case(pack_8_16_32({4, 4, 4, 4}), 1); + + // k_10_11_11, k_10_11_11_AS_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_10_11_11)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16)); + add_format_case(pack_8_16_32({11, 11, 10}), 2); + + // k_11_11_10, k_11_11_10_AS_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_11_11_10)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16)); + add_format_case(pack_8_16_32({10, 11, 11}), 2); + + // k_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16)); + add_format_case(pack_8_16_32({16}), 1); + + // k_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16)); + add_format_case(pack_8_16_32({16, 16}), 2); + + // k_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16_16_16)); + { + // Flush NaNs. + EMIdArray fixed16_flushed = flush_nan(eM_swapped); + + // Convert to integers. + SpirvBuilder::IfBuilder if_signed( + is_signed, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray fixed16_signed; + { + // Signed. + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray fixed16_norm; + { + // Signed normalized. + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant( + float((uint32_t(1) << (16 - 1)) - 1))); + spv::Id const_snorm16_max_value = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + for_each_eM([&](uint32_t eM_index) { + fixed16_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float4_, + builder_->createTriBuiltinCall( + type_float4_, ext_inst_glsl_std_450_, GLSLstd450FClamp, + fixed16_flushed[eM_index], const_float_vectors_minus_1[3], + const_float4_1_), + const_snorm16_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + fixed16_signed[eM_index] = if_norm.createMergePhi( + fixed16_norm[eM_index], fixed16_flushed[eM_index]); + }); + // Convert to signed integer, adding plus/minus 0.5 before truncating + // according to the Direct3D format conversion rules. + for_each_eM([&](uint32_t eM_index) { + fixed16_signed[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, + builder_->createUnaryOp( + spv::OpConvertFToS, type_int4_, + builder_->createNoContractionBinOp( + spv::OpFAdd, type_float4_, fixed16_signed[eM_index], + builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createBinOp(spv::OpFOrdLessThan, type_bool4_, + fixed16_signed[eM_index], + const_float4_0_), + const_float_vectors_minus_0_5[3], + const_float_vectors_0_5[3])))); + }); + } + if_signed.makeBeginElse(); + EMIdArray fixed16_unsigned; + { + // Unsigned. + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray fixed16_norm; + { + // Unsigned normalized. + id_vector_temp_.clear(); + id_vector_temp_.resize( + 4, builder_->makeFloatConstant(float((uint32_t(1) << 16) - 1))); + spv::Id const_unorm16_max_value = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + for_each_eM([&](uint32_t eM_index) { + fixed16_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float4_, + builder_->createTriBuiltinCall( + type_float4_, ext_inst_glsl_std_450_, GLSLstd450FClamp, + fixed16_flushed[eM_index], const_float4_0_, const_float4_1_), + const_unorm16_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + fixed16_unsigned[eM_index] = if_norm.createMergePhi( + fixed16_norm[eM_index], fixed16_flushed[eM_index]); + }); + // Convert to unsigned integer, adding 0.5 before truncating according to + // the Direct3D format conversion rules. + for_each_eM([&](uint32_t eM_index) { + fixed16_unsigned[eM_index] = builder_->createUnaryOp( + spv::OpConvertFToU, type_uint4_, + builder_->createNoContractionBinOp(spv::OpFAdd, type_float4_, + fixed16_unsigned[eM_index], + const_float_vectors_0_5[3])); + }); + } + if_signed.makeEndIf(); + EMIdArray fixed16_unpacked; + for_each_eM([&](uint32_t eM_index) { + fixed16_unpacked[eM_index] = if_signed.createMergePhi( + fixed16_signed[eM_index], fixed16_unsigned[eM_index]); + }); + + // Pack into two 32-bit values, and pad to a 4-component vector for the phi. + EMIdArray fixed16_packed; + spv::Id const_uint_16 = builder_->makeUintConstant(16); + for_each_eM([&](uint32_t eM_index) { + spv::Id fixed16_element_unpacked = fixed16_unpacked[eM_index]; + id_vector_temp_.clear(); + for (uint32_t component_index = 0; component_index < 2; + ++component_index) { + id_vector_temp_.push_back(builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, + builder_->createCompositeExtract(fixed16_element_unpacked, + type_uint_, 2 * component_index), + builder_->createCompositeExtract( + fixed16_element_unpacked, type_uint_, 2 * component_index + 1), + const_uint_16, const_uint_16)); + } + for (uint32_t component_index = 2; component_index < 4; + ++component_index) { + id_vector_temp_.push_back(const_uint_0_); + } + fixed16_packed[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + + add_format_case(fixed16_packed, 3); + } + + // TODO(Triang3l): Use the extended range float16 conversion. + + // k_16_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_FLOAT)); + { + EMIdArray format_packed_16_float; + for_each_eM([&](uint32_t eM_index) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->createCompositeExtract( + eM_swapped[eM_index], type_float_, 0)); + id_vector_temp_.push_back(const_float_0_); + spv::Id format_packed_16_float_x = builder_->createUnaryBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450PackHalf2x16, + builder_->createCompositeConstruct(type_float2_, id_vector_temp_)); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, const_uint_0_); + id_vector_temp_.front() = format_packed_16_float_x; + format_packed_16_float[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + add_format_case(format_packed_16_float, 1); + } + + // k_16_16_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16_FLOAT)); + { + EMIdArray format_packed_16_16_float; + for_each_eM([&](uint32_t eM_index) { + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + spv::Id format_packed_16_16_float_xy = builder_->createUnaryBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450PackHalf2x16, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + eM_swapped[eM_index], + uint_vector_temp_)); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, const_uint_0_); + id_vector_temp_.front() = format_packed_16_16_float_xy; + format_packed_16_16_float[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + add_format_case(format_packed_16_16_float, 2); + } + + // k_16_16_16_16_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16_16_16_FLOAT)); + { + EMIdArray format_packed_16_16_16_16_float; + for_each_eM([&](uint32_t eM_index) { + spv::Id format_packed_16_16_16_16_float_xy_zw[2]; + for (uint32_t component_index = 0; component_index < 2; + ++component_index) { + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(2 * component_index); + uint_vector_temp_.push_back(2 * component_index + 1); + format_packed_16_16_16_16_float_xy_zw[component_index] = + builder_->createUnaryBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450PackHalf2x16, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + eM_swapped[eM_index], + uint_vector_temp_)); + } + id_vector_temp_.clear(); + id_vector_temp_.push_back(format_packed_16_16_16_16_float_xy_zw[0]); + id_vector_temp_.push_back(format_packed_16_16_16_16_float_xy_zw[1]); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(const_uint_0_); + format_packed_16_16_16_16_float[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + add_format_case(format_packed_16_16_16_16_float, 3); + } + + // k_32_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_32_FLOAT)); + { + EMIdArray format_packed_32_float; + for_each_eM([&](uint32_t eM_index) { + format_packed_32_float[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, eM_swapped[eM_index]); + }); + add_format_case(format_packed_32_float, 2); + } + + // k_32_32_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_32_32_FLOAT)); + { + EMIdArray format_packed_32_32_float; + for_each_eM([&](uint32_t eM_index) { + format_packed_32_32_float[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, eM_swapped[eM_index]); + }); + add_format_case(format_packed_32_32_float, 3); + } + + // k_32_32_32_32_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_32_32_32_32_FLOAT)); + { + EMIdArray format_packed_32_32_32_32_float; + for_each_eM([&](uint32_t eM_index) { + format_packed_32_32_32_32_float[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, eM_swapped[eM_index]); + }); + add_format_case(format_packed_32_32_32_32_float, 4); + } + + format_switch.makeEndSwitch(); + + // Select the result and the element size based on the format. + // Phi must be the first instructions in a block. + EMIdArray eM_packed; + for_each_eM([&](uint32_t eM_index) { + auto eM_packed_phi = std::make_unique( + builder_->getUniqueId(), type_uint4_, spv::OpPhi); + // Default case for an invalid format. + eM_packed_phi->addIdOperand(const_uint4_0_); + eM_packed_phi->addIdOperand(format_switch.getDefaultPhiParent()); + for (const FormatCase& format_case : format_cases) { + eM_packed_phi->addIdOperand(format_case.eM_packed[eM_index]); + eM_packed_phi->addIdOperand(format_case.phi_parent); + } + eM_packed[eM_index] = eM_packed_phi->getResultId(); + builder_->getBuildPoint()->addInstruction(std::move(eM_packed_phi)); + }); + spv::Id element_bytes_log2; + { + auto element_bytes_log2_phi = std::make_unique( + builder_->getUniqueId(), type_uint_, spv::OpPhi); + // Default case for an invalid format (doesn't enter any element size + // conditional, skipped). + element_bytes_log2_phi->addIdOperand(builder_->makeUintConstant(5)); + element_bytes_log2_phi->addIdOperand(format_switch.getDefaultPhiParent()); + for (const FormatCase& format_case : format_cases) { + element_bytes_log2_phi->addIdOperand( + builder_->makeUintConstant(format_case.element_bytes_log2)); + element_bytes_log2_phi->addIdOperand(format_case.phi_parent); + } + element_bytes_log2 = element_bytes_log2_phi->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(element_bytes_log2_phi)); + } + + // Endian-swap. + spv::Id endian = + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, format_info, + const_uint_0_, builder_->makeUintConstant(3)); + for_each_eM([&](uint32_t eM_index) { + eM_packed[eM_index] = EndianSwap128Uint4(eM_packed[eM_index], endian); + }); + + // Load the index of eM0 in the stream. + spv::Id eM0_index = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, + builder_->createCompositeExtract(eA_vector, type_uint_, 1), const_uint_0_, + builder_->makeUintConstant(23)); + + // Check how many elements starting from eM0 are within the bounds of the + // stream, and from the eM# that were written, exclude the out-of-bounds ones. + // The index can't be negative, and the index and the count are limited to 23 + // bits, so it's safe to use 32-bit signed subtraction and clamping to get the + // remaining eM# count. + spv::Id eM_indices_to_store = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, + builder_->createLoad(var_main_memexport_data_written_, spv::NoPrecision), + const_uint_0_, + builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createTriBuiltinCall( + type_int_, ext_inst_glsl_std_450_, GLSLstd450SClamp, + builder_->createBinOp( + spv::OpISub, type_int_, + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, + builder_->createCompositeExtract( + eA_vector, type_uint_, 3), + const_uint_0_, + builder_->makeUintConstant(23))), + builder_->createUnaryOp(spv::OpBitcast, type_int_, + eM0_index)), + const_int_0_, + builder_->makeIntConstant(ucode::kMaxMemExportElementCount)))); + + // Get the eM0 address in bytes. + // Left-shift the stream base address by 2 to both convert it from dwords to + // bytes and drop the upper bits. + spv::Id const_uint_2 = builder_->makeUintConstant(2); + spv::Id eM0_address_bytes = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, + builder_->createCompositeExtract(eA_vector, type_uint_, 0), + const_uint_2), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, eM0_index, + element_bytes_log2)); + + // Store based on the element size. + auto store_needed_eM = [&](std::function fn) { + for_each_eM([&](uint32_t eM_index) { + SpirvBuilder::IfBuilder if_eM_needed( + builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + eM_indices_to_store, + builder_->makeUintConstant(1u << eM_index)), + const_uint_0_), + spv::SelectionControlDontFlattenMask, *builder_, 2, 1); + fn(eM_index); + if_eM_needed.makeEndIf(); + }); + }; + SpirvBuilder::SwitchBuilder element_size_switch( + element_bytes_log2, spv::SelectionControlDontFlattenMask, *builder_); + element_size_switch.makeBeginCase(0); + { + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_address_bytes = + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_bytes, + builder_->makeUintConstant(eM_index)) + : eM0_address_bytes; + // replace_shift = 8 * (element_address_bytes & 3) + spv::Id replace_shift = builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, const_uint_0_, + element_address_bytes, builder_->makeUintConstant(3), const_uint_2); + StoreUint32ToSharedMemory( + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->createCompositeExtract( + eM_packed[eM_index], type_uint_, 0), + replace_shift), + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + element_address_bytes, const_uint_2)), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->makeUintConstant(0xFFu), + replace_shift)); + }); + } + element_size_switch.makeBeginCase(1); + { + spv::Id const_uint_1 = builder_->makeUintConstant(1); + spv::Id eM0_address_words = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_1); + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_address_words = + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_words, + builder_->makeUintConstant(eM_index)) + : eM0_address_words; + // replace_shift = 16 * (element_address_words & 1) + spv::Id replace_shift = builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, const_uint_0_, + element_address_words, builder_->makeUintConstant(4), const_uint_1); + StoreUint32ToSharedMemory( + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->createCompositeExtract( + eM_packed[eM_index], type_uint_, 0), + replace_shift), + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + element_address_words, const_uint_1)), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->makeUintConstant(0xFFFFu), + replace_shift)); + }); + } + element_size_switch.makeBeginCase(2); + { + spv::Id eM0_address_dwords = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_2); + store_needed_eM([&](uint32_t eM_index) { + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(eM_packed[eM_index], type_uint_, 0), + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_dwords, + builder_->makeUintConstant(eM_index)) + : eM0_address_dwords)); + }); + } + element_size_switch.makeBeginCase(3); + { + spv::Id eM0_address_dwords = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_2); + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_value = eM_packed[eM_index]; + spv::Id element_address_dwords_int = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_dwords, + builder_->makeUintConstant(2 * eM_index)) + : eM0_address_dwords); + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, 0), + element_address_dwords_int); + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, 1), + builder_->createBinOp(spv::OpIAdd, type_int_, + element_address_dwords_int, + builder_->makeIntConstant(1))); + }); + } + element_size_switch.makeBeginCase(4); + { + spv::Id eM0_address_dwords = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_2); + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_value = eM_packed[eM_index]; + spv::Id element_address_dwords_int = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_dwords, + builder_->makeUintConstant(4 * eM_index)) + : eM0_address_dwords); + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, 0), + element_address_dwords_int); + for (uint32_t element_dword_index = 1; element_dword_index < 4; + ++element_dword_index) { + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, + element_dword_index), + builder_->createBinOp(spv::OpIAdd, type_int_, + element_address_dwords_int, + builder_->makeIntConstant( + static_cast(element_dword_index)))); + } + }); + } + element_size_switch.makeEndSwitch(); + + // Close the conditionals for whether memory export is allowed in this + // invocation. + if_address_valid.makeEndIf(); + if (if_pixel_not_killed.has_value()) { + if_pixel_not_killed->makeEndIf(); + } + if (if_memexport_allowed.has_value()) { + if_memexport_allowed->makeEndIf(); + } +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 806382e00..b2af47f30 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2165,6 +2165,11 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return IssueCopy(); } + const ui::vulkan::VulkanProvider::DeviceInfo& device_info = + GetVulkanProvider().device_info(); + + memexport_ranges_.clear(); + // Vertex shader analysis. auto vertex_shader = static_cast(active_vertex_shader()); if (!vertex_shader) { @@ -2172,7 +2177,14 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; + // TODO(Triang3l): If the shader uses memory export, but + // vertexPipelineStoresAndAtomics is not supported, convert the vertex shader + // to a compute shader and dispatch it after the draw if the draw doesn't use + // tessellation. + if (vertex_shader->memexport_eM_written() != 0 && + device_info.vertexPipelineStoresAndAtomics) { + draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_); + } // Pixel shader analysis. bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); @@ -2195,12 +2207,15 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, } else { // Disabling pixel shader for this case is also required by the pipeline // cache. - if (!memexport_used_vertex) { + if (memexport_ranges_.empty()) { // This draw has no effect. return true; } } - // TODO(Triang3l): Memory export. + if (pixel_shader && pixel_shader->memexport_eM_written() != 0 && + device_info.fragmentStoresAndAtomics) { + draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_); + } uint32_t ps_param_gen_pos = UINT32_MAX; uint32_t interpolator_mask = @@ -2416,9 +2431,6 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, current_guest_graphics_pipeline_layout_ = pipeline_layout; } - const ui::vulkan::VulkanProvider::DeviceInfo& device_info = - GetVulkanProvider().device_info(); - bool host_render_targets_used = render_target_cache_->GetPath() == RenderTargetCache::Path::kHostRenderTargets; @@ -2520,9 +2532,39 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, << (vfetch_index & 63); } + // Synchronize the memory pages backing memory scatter export streams, and + // calculate the range that includes the streams for the buffer barrier. + uint32_t memexport_extent_start = UINT32_MAX, memexport_extent_end = 0; + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + uint32_t memexport_range_base_bytes = memexport_range.base_address_dwords + << 2; + if (!shared_memory_->RequestRange(memexport_range_base_bytes, + memexport_range.size_bytes)) { + XELOGE( + "Failed to request memexport stream at 0x{:08X} (size {}) in the " + "shared memory", + memexport_range_base_bytes, memexport_range.size_bytes); + return false; + } + memexport_extent_start = + std::min(memexport_extent_start, memexport_range_base_bytes); + memexport_extent_end = + std::max(memexport_extent_end, + memexport_range_base_bytes + memexport_range.size_bytes); + } + // Insert the shared memory barrier if needed. - // TODO(Triang3l): Memory export. - shared_memory_->Use(VulkanSharedMemory::Usage::kRead); + // TODO(Triang3l): Find some PM4 command that can be used for indication of + // when memexports should be awaited instead of inserting the barrier in Use + // every time if memory export was done in the previous draw? + if (memexport_extent_start < memexport_extent_end) { + shared_memory_->Use( + VulkanSharedMemory::Usage::kGuestDrawReadWrite, + std::make_pair(memexport_extent_start, + memexport_extent_end - memexport_extent_start)); + } else { + shared_memory_->Use(VulkanSharedMemory::Usage::kRead); + } // After all commands that may dispatch, copy or insert barriers, submit the // barriers (may end the render pass), and (re)enter the render pass before @@ -2567,6 +2609,12 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, primitive_processing_result.host_draw_vertex_count, 1, 0, 0, 0); } + // Invalidate textures in memexported memory and watch for changes. + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2, + memexport_range.size_bytes, false); + } + return true; } diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 8e1df02ef..022fb37b2 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -737,6 +737,9 @@ class VulkanCommandProcessor : public CommandProcessor { // System shader constants. SpirvShaderTranslator::SystemConstants system_constants_; + + // Temporary storage for memexport stream constants used in the draw. + std::vector memexport_ranges_; }; } // namespace vulkan