From 9fa41c27bc185a6b3a75196a811f6c3709506858 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 26 Jul 2022 16:01:20 +0300 Subject: [PATCH 1/5] [Vulkan] Point sprite geometry shader --- .../gpu/d3d12/d3d12_command_processor.cc | 80 ++--- src/xenia/gpu/spirv_shader_translator.cc | 206 +++++++++-- src/xenia/gpu/spirv_shader_translator.h | 23 +- .../gpu/vulkan/vulkan_command_processor.cc | 43 +++ src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc | 331 +++++++++++++++++- src/xenia/gpu/vulkan/vulkan_pipeline_cache.h | 2 + 6 files changed, 591 insertions(+), 94 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index b92a61d77..b6f72ff9b 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -3160,8 +3160,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( const RegisterFile& regs = *register_file_; auto pa_cl_clip_cntl = regs.Get(); auto pa_cl_vte_cntl = regs.Get(); - auto pa_su_point_minmax = regs.Get(); - auto pa_su_point_size = regs.Get(); auto pa_su_sc_mode_cntl = regs.Get(); float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32; auto rb_colorcontrol = regs.Get(); @@ -3365,43 +3363,47 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( } // Point size. 
- float point_vertex_diameter_min = - float(pa_su_point_minmax.min_size) * (2.0f / 16.0f); - float point_vertex_diameter_max = - float(pa_su_point_minmax.max_size) * (2.0f / 16.0f); - float point_constant_diameter_x = - float(pa_su_point_size.width) * (2.0f / 16.0f); - float point_constant_diameter_y = - float(pa_su_point_size.height) * (2.0f / 16.0f); - dirty |= - system_constants_.point_vertex_diameter_min != point_vertex_diameter_min; - dirty |= - system_constants_.point_vertex_diameter_max != point_vertex_diameter_max; - dirty |= - system_constants_.point_constant_diameter[0] != point_constant_diameter_x; - dirty |= - system_constants_.point_constant_diameter[1] != point_constant_diameter_y; - system_constants_.point_vertex_diameter_min = point_vertex_diameter_min; - system_constants_.point_vertex_diameter_max = point_vertex_diameter_max; - system_constants_.point_constant_diameter[0] = point_constant_diameter_x; - system_constants_.point_constant_diameter[1] = point_constant_diameter_y; - // 2 because 1 in the NDC is half of the viewport's axis, 0.5 for diameter to - // radius conversion to avoid multiplying the per-vertex diameter by an - // additional constant in the shader. 
- float point_screen_diameter_to_ndc_radius_x = - (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) / - std::max(viewport_info.xy_extent[0], uint32_t(1)); - float point_screen_diameter_to_ndc_radius_y = - (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) / - std::max(viewport_info.xy_extent[1], uint32_t(1)); - dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] != - point_screen_diameter_to_ndc_radius_x; - dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] != - point_screen_diameter_to_ndc_radius_y; - system_constants_.point_screen_diameter_to_ndc_radius[0] = - point_screen_diameter_to_ndc_radius_x; - system_constants_.point_screen_diameter_to_ndc_radius[1] = - point_screen_diameter_to_ndc_radius_y; + if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) { + auto pa_su_point_minmax = regs.Get(); + auto pa_su_point_size = regs.Get(); + float point_vertex_diameter_min = + float(pa_su_point_minmax.min_size) * (2.0f / 16.0f); + float point_vertex_diameter_max = + float(pa_su_point_minmax.max_size) * (2.0f / 16.0f); + float point_constant_diameter_x = + float(pa_su_point_size.width) * (2.0f / 16.0f); + float point_constant_diameter_y = + float(pa_su_point_size.height) * (2.0f / 16.0f); + dirty |= system_constants_.point_vertex_diameter_min != + point_vertex_diameter_min; + dirty |= system_constants_.point_vertex_diameter_max != + point_vertex_diameter_max; + dirty |= system_constants_.point_constant_diameter[0] != + point_constant_diameter_x; + dirty |= system_constants_.point_constant_diameter[1] != + point_constant_diameter_y; + system_constants_.point_vertex_diameter_min = point_vertex_diameter_min; + system_constants_.point_vertex_diameter_max = point_vertex_diameter_max; + system_constants_.point_constant_diameter[0] = point_constant_diameter_x; + system_constants_.point_constant_diameter[1] = point_constant_diameter_y; + // 2 because 1 in the NDC is half of the viewport's axis, 0.5 for diameter + // to radius 
conversion to avoid multiplying the per-vertex diameter by an + // additional constant in the shader. + float point_screen_diameter_to_ndc_radius_x = + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) / + std::max(viewport_info.xy_extent[0], uint32_t(1)); + float point_screen_diameter_to_ndc_radius_y = + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) / + std::max(viewport_info.xy_extent[1], uint32_t(1)); + dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] != + point_screen_diameter_to_ndc_radius_x; + dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] != + point_screen_diameter_to_ndc_radius_y; + system_constants_.point_screen_diameter_to_ndc_radius[0] = + point_screen_diameter_to_ndc_radius_x; + system_constants_.point_screen_diameter_to_ndc_radius[1] = + point_screen_diameter_to_ndc_radius_y; + } // Texture signedness / gamma. bool gamma_render_target_as_srgb = diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index f7dc2c1f3..452ac1450 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -106,16 +106,19 @@ void SpirvShaderTranslator::Reset() { uniform_float_constants_ = spv::NoResult; - input_fragment_coord_ = spv::NoResult; + input_point_coordinates_ = spv::NoResult; + input_fragment_coordinates_ = spv::NoResult; input_front_facing_ = spv::NoResult; std::fill(input_output_interpolators_.begin(), input_output_interpolators_.end(), spv::NoResult); + output_point_size_ = spv::NoResult; sampler_bindings_.clear(); texture_bindings_.clear(); main_interface_.clear(); var_main_registers_ = spv::NoResult; + var_main_point_size_edge_flag_kill_vertex_ = spv::NoResult; main_switch_op_.reset(); main_switch_next_pc_phi_operands_.clear(); @@ -230,7 +233,16 @@ void SpirvShaderTranslator::StartTranslation() { {"vertex_base_index", offsetof(SystemConstants, vertex_base_index), type_int_}, {"ndc_scale", offsetof(SystemConstants, ndc_scale), 
type_float3_}, + {"point_vertex_diameter_min", + offsetof(SystemConstants, point_vertex_diameter_min), type_float_}, {"ndc_offset", offsetof(SystemConstants, ndc_offset), type_float3_}, + {"point_vertex_diameter_max", + offsetof(SystemConstants, point_vertex_diameter_max), type_float_}, + {"point_constant_diameter", + offsetof(SystemConstants, point_constant_diameter), type_float2_}, + {"point_screen_diameter_to_ndc_radius", + offsetof(SystemConstants, point_screen_diameter_to_ndc_radius), + type_float2_}, {"texture_swizzled_signs", offsetof(SystemConstants, texture_swizzled_signs), type_uint4_array_2}, {"texture_swizzles", offsetof(SystemConstants, texture_swizzles), @@ -1063,9 +1075,10 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() { main_interface_.push_back(input_vertex_index_); } + uint32_t output_location = 0; + // Create the interpolator outputs. { - uint32_t interpolator_location = 0; uint32_t interpolators_remaining = GetModificationInterpolatorMask(); uint32_t interpolator_index; while (xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) { @@ -1075,13 +1088,29 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() { fmt::format("xe_out_interpolator_{}", interpolator_index).c_str()); input_output_interpolators_[interpolator_index] = interpolator; builder_->addDecoration(interpolator, spv::DecorationLocation, - int(interpolator_location)); + int(output_location)); builder_->addDecoration(interpolator, spv::DecorationInvariant); main_interface_.push_back(interpolator); - ++interpolator_location; + ++output_location; } } + Modification shader_modification = GetSpirvShaderModification(); + + // Create the point size output. Not using gl_PointSize from gl_PerVertex not + // to rely on the shaderTessellationAndGeometryPointSize feature, and also + // because the value written to gl_PointSize must be greater than zero. 
+ if (shader_modification.vertex.output_point_size) { + output_point_size_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float_, "xe_out_point_size"); + builder_->addDecoration(output_point_size_, spv::DecorationLocation, + int(output_location)); + builder_->addDecoration(output_point_size_, spv::DecorationInvariant); + main_interface_.push_back(output_point_size_); + ++output_location; + } + // Create the gl_PerVertex output for used system outputs. std::vector struct_per_vertex_members; struct_per_vertex_members.reserve(kOutputPerVertexMemberCount); @@ -1103,9 +1132,23 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() { } void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { - var_main_point_size_edge_flag_kill_vertex_ = builder_->createVariable( - spv::NoPrecision, spv::StorageClassFunction, type_float3_, - "xe_var_point_size_edge_flag_kill_vertex"); + // The edge flag isn't used for any purpose by the translator. + if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b101) { + id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + // Set the point size to a negative value to tell the point sprite expansion + // that it should use the default point size if the vertex shader does not + // override it. + id_vector_temp_.push_back(builder_->makeFloatConstant(-1.0f)); + // The edge flag is ignored. + id_vector_temp_.push_back(const_float_0_); + // Don't kill by default (zero bits 0:30). + id_vector_temp_.push_back(const_float_0_); + var_main_point_size_edge_flag_kill_vertex_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float3_, + "xe_var_point_size_edge_flag_kill_vertex", + builder_->makeCompositeConstant(type_float3_, id_vector_temp_)); + } // Zero general-purpose registers to prevent crashes when the game // references them after only initializing them conditionally. 
@@ -1352,13 +1395,35 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { std::move(composite_construct_op)); } builder_->createStore(position, position_ptr); + + // Write the point size. + if (output_point_size_ != spv::NoResult) { + spv::Id point_size; + if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b001) { + assert_true(var_main_point_size_edge_flag_kill_vertex_ != spv::NoResult); + id_vector_temp_.clear(); + // X vector component. + id_vector_temp_.push_back(const_int_0_); + point_size = builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassFunction, + var_main_point_size_edge_flag_kill_vertex_, id_vector_temp_), + spv::NoPrecision); + } else { + // Not statically overridden - write a negative value. + point_size = builder_->makeFloatConstant(-1.0f); + } + builder_->createStore(point_size, output_point_size_); + } } void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { - // Interpolator inputs. Modification shader_modification = GetSpirvShaderModification(); + + uint32_t input_location = 0; + + // Interpolator inputs. 
{ - uint32_t interpolator_location = 0; uint32_t interpolators_remaining = GetModificationInterpolatorMask(); uint32_t interpolator_index; while (xe::bit_scan_forward(interpolators_remaining, &interpolator_index)) { @@ -1368,28 +1433,41 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { fmt::format("xe_in_interpolator_{}", interpolator_index).c_str()); input_output_interpolators_[interpolator_index] = interpolator; builder_->addDecoration(interpolator, spv::DecorationLocation, - int(interpolator_location)); + int(input_location)); if (shader_modification.pixel.interpolators_centroid & (UINT32_C(1) << interpolator_index)) { builder_->addDecoration(interpolator, spv::DecorationCentroid); } main_interface_.push_back(interpolator); - ++interpolator_location; + ++input_location; } } bool param_gen_needed = GetPsParamGenInterpolator() != UINT32_MAX; + // Point coordinate input. + if (shader_modification.pixel.param_gen_point) { + if (param_gen_needed) { + input_point_coordinates_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassInput, + type_float2_, "xe_in_point_coordinates"); + builder_->addDecoration(input_point_coordinates_, spv::DecorationLocation, + int(input_location)); + main_interface_.push_back(input_point_coordinates_); + } + ++input_location; + } + // Fragment coordinates. // TODO(Triang3l): More conditions - fragment shader interlock render backend, // alpha to coverage (if RT 0 is written, and there's no early depth / // stencil), depth writing in the fragment shader (per-sample if supported). 
if (param_gen_needed) { - input_fragment_coord_ = builder_->createVariable( + input_fragment_coordinates_ = builder_->createVariable( spv::NoPrecision, spv::StorageClassInput, type_float4_, "gl_FragCoord"); - builder_->addDecoration(input_fragment_coord_, spv::DecorationBuiltIn, + builder_->addDecoration(input_fragment_coordinates_, spv::DecorationBuiltIn, spv::BuiltInFragCoord); - main_interface_.push_back(input_fragment_coord_); + main_interface_.push_back(input_fragment_coordinates_); } // Is front facing. @@ -1473,13 +1551,14 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() { spv::Id const_sign_bit = builder_->makeUintConstant(UINT32_C(1) << 31); // TODO(Triang3l): Resolution scale inversion. // X - pixel X .0 in the magnitude, is back-facing in the sign bit. - assert_true(input_fragment_coord_ != spv::NoResult); + assert_true(input_fragment_coordinates_ != spv::NoResult); id_vector_temp_.clear(); id_vector_temp_.push_back(const_int_0_); - spv::Id param_gen_x = builder_->createLoad( - builder_->createAccessChain(spv::StorageClassInput, - input_fragment_coord_, id_vector_temp_), - spv::NoPrecision); + spv::Id param_gen_x = + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassInput, + input_fragment_coordinates_, id_vector_temp_), + spv::NoPrecision); id_vector_temp_.clear(); id_vector_temp_.push_back(param_gen_x); param_gen_x = builder_->createBuiltinCall( @@ -1514,10 +1593,11 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() { // Y - pixel Y .0 in the magnitude, is point in the sign bit. 
id_vector_temp_.clear(); id_vector_temp_.push_back(builder_->makeIntConstant(1)); - spv::Id param_gen_y = builder_->createLoad( - builder_->createAccessChain(spv::StorageClassInput, - input_fragment_coord_, id_vector_temp_), - spv::NoPrecision); + spv::Id param_gen_y = + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassInput, + input_fragment_coordinates_, id_vector_temp_), + spv::NoPrecision); id_vector_temp_.clear(); id_vector_temp_.push_back(param_gen_y); param_gen_y = builder_->createBuiltinCall( @@ -1535,10 +1615,16 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() { const_sign_bit)); } // Z - point S in the magnitude, is line in the sign bit. - spv::Id param_gen_z; + // W - point T in the magnitude. + spv::Id param_gen_z, param_gen_w; if (modification.pixel.param_gen_point) { - // TODO(Triang3l): Point coordinates. - param_gen_z = const_float_0_; + assert_true(input_point_coordinates_ != spv::NoResult); + spv::Id param_gen_point_coordinates = + builder_->createLoad(input_point_coordinates_, spv::NoPrecision); + param_gen_z = builder_->createCompositeExtract( + param_gen_point_coordinates, type_float_, 0); + param_gen_w = builder_->createCompositeExtract( + param_gen_point_coordinates, type_float_, 1); } else { param_gen_z = builder_->createUnaryOp( spv::OpBitcast, type_float_, @@ -1552,10 +1638,8 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() { builder_->makeUintConstant(kSysFlag_PrimitiveLine)), const_uint_0_), const_sign_bit, const_uint_0_)); + param_gen_w = const_float_0_; } - // W - point T in the magnitude. - // TODO(Triang3l): Point coordinates. - spv::Id param_gen_w = const_float_0_; // Store the pixel parameters. id_vector_temp_.clear(); id_vector_temp_.reserve(4); @@ -1927,15 +2011,20 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result, target_pointer = input_output_interpolators_[result.storage_index]; // Unused interpolators are spv::NoResult in input_output_interpolators_. 
} break; - case InstructionStorageTarget::kPosition: + case InstructionStorageTarget::kPosition: { assert_true(is_vertex_shader()); id_vector_temp_util_.clear(); id_vector_temp_util_.push_back( builder_->makeIntConstant(kOutputPerVertexMemberPosition)); target_pointer = builder_->createAccessChain( spv::StorageClassOutput, output_per_vertex_, id_vector_temp_util_); - break; - case InstructionStorageTarget::kColor: + } break; + case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: { + assert_true(is_vertex_shader()); + assert_zero(used_write_mask & 0b1000); + target_pointer = var_main_point_size_edge_flag_kill_vertex_; + } break; + case InstructionStorageTarget::kColor: { assert_true(is_pixel_shader()); assert_not_zero(used_write_mask); assert_true(current_shader().writes_color_target(result.storage_index)); @@ -1944,7 +2033,7 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result, // an empty write mask without independent blending. // TODO(Triang3l): Store the alpha of the first output in this case for // alpha test and alpha to coverage. - break; + } break; default: // TODO(Triang3l): All storage targets. break; @@ -2179,6 +2268,57 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result, } } } + + if (result.storage_target == + InstructionStorageTarget::kPointSizeEdgeFlagKillVertex && + used_write_mask & 0b001) { + // Make the point size non-negative as negative is used to indicate that the + // default size must be used, and also clamp it to the bounds the way the + // R400 (Adreno 200, to be more precise) hardware clamps it (functionally + // like a signed 32-bit integer, -NaN and -Infinity...-0 to the minimum, + // +NaN to the maximum). 
+ spv::Id point_size = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createCompositeExtract(value_to_store, type_float_, 0)); + id_vector_temp_util_.clear(); + id_vector_temp_util_.push_back( + builder_->makeIntConstant(kSystemConstantPointVertexDiameterMin)); + spv::Id point_vertex_diameter_min = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_util_), + spv::NoPrecision)); + id_vector_temp_util_.clear(); + id_vector_temp_util_.reserve(2); + id_vector_temp_util_.push_back(point_vertex_diameter_min); + id_vector_temp_util_.push_back(point_size); + point_size = + builder_->createBuiltinCall(type_int_, ext_inst_glsl_std_450_, + GLSLstd450SMax, id_vector_temp_util_); + id_vector_temp_util_.clear(); + id_vector_temp_util_.push_back( + builder_->makeIntConstant(kSystemConstantPointVertexDiameterMax)); + spv::Id point_vertex_diameter_max = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_util_), + spv::NoPrecision)); + id_vector_temp_util_.clear(); + id_vector_temp_util_.reserve(2); + id_vector_temp_util_.push_back(point_vertex_diameter_max); + id_vector_temp_util_.push_back(point_size); + point_size = + builder_->createBuiltinCall(type_int_, ext_inst_glsl_std_450_, + GLSLstd450SMin, id_vector_temp_util_); + value_to_store = builder_->createCompositeInsert( + builder_->createUnaryOp(spv::OpBitcast, type_float_, point_size), + value_to_store, type_float3_, 0); + } + builder_->createStore(value_to_store, target_pointer); } diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 69d05d95c..733bbf2ff 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -34,7 +34,7 @@ class SpirvShaderTranslator : public 
ShaderTranslator { // TODO(Triang3l): Change to 0xYYYYMMDD once it's out of the rapid // prototyping stage (easier to do small granular updates with an // incremental counter). - static constexpr uint32_t kVersion = 5; + static constexpr uint32_t kVersion = 6; enum class DepthStencilMode : uint32_t { kNoModifiers, @@ -50,6 +50,7 @@ class SpirvShaderTranslator : public ShaderTranslator { // Interpolators written by the vertex shader and needed by the pixel // shader. uint32_t interpolator_mask : xenos::kMaxInterpolators; + uint32_t output_point_size : 1; // Dynamically indexable register count from SQ_PROGRAM_CNTL. uint32_t dynamic_addressable_register_count : 8; // Pipeline stage and input configuration. @@ -145,10 +146,15 @@ class SpirvShaderTranslator : public ShaderTranslator { int32_t vertex_base_index; float ndc_scale[3]; - uint32_t padding_ndc_scale; + float point_vertex_diameter_min; float ndc_offset[3]; - uint32_t padding_ndc_offset; + float point_vertex_diameter_max; + + float point_constant_diameter[2]; + // Diameter in guest screen coordinates > radius (0.5 * diameter) in the NDC + // for the host viewport. + float point_screen_diameter_to_ndc_radius[2]; // Each byte contains post-swizzle TextureSign values for each of the needed // components of each of the 32 used texture fetch constants. @@ -603,7 +609,11 @@ class SpirvShaderTranslator : public ShaderTranslator { kSystemConstantVertexIndexEndian, kSystemConstantVertexBaseIndex, kSystemConstantNdcScale, + kSystemConstantPointVertexDiameterMin, kSystemConstantNdcOffset, + kSystemConstantPointVertexDiameterMax, + kSystemConstantPointConstantDiameter, + kSystemConstantPointScreenDiameterToNdcRadius, kSystemConstantTextureSwizzledSigns, kSystemConstantTextureSwizzles, kSystemConstantAlphaTestReference, @@ -627,8 +637,10 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id input_vertex_index_; // VS as TES only - int. spv::Id input_primitive_id_; + // PS, only when needed - float2. 
+ spv::Id input_point_coordinates_; // PS, only when needed - float4. - spv::Id input_fragment_coord_; + spv::Id input_fragment_coordinates_; // PS, only when needed - bool. spv::Id input_front_facing_; @@ -643,6 +655,9 @@ class SpirvShaderTranslator : public ShaderTranslator { // all). std::array input_output_interpolators_; + // VS, only when needed - float. + spv::Id output_point_size_; + enum OutputPerVertexMember : unsigned int { kOutputPerVertexMemberPosition, kOutputPerVertexMemberCount, diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 3c4422561..80affe639 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -3482,6 +3482,49 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i]; } + // Point size. + if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) { + auto pa_su_point_minmax = regs.Get(); + auto pa_su_point_size = regs.Get(); + float point_vertex_diameter_min = + float(pa_su_point_minmax.min_size) * (2.0f / 16.0f); + float point_vertex_diameter_max = + float(pa_su_point_minmax.max_size) * (2.0f / 16.0f); + float point_constant_diameter_x = + float(pa_su_point_size.width) * (2.0f / 16.0f); + float point_constant_diameter_y = + float(pa_su_point_size.height) * (2.0f / 16.0f); + dirty |= system_constants_.point_vertex_diameter_min != + point_vertex_diameter_min; + dirty |= system_constants_.point_vertex_diameter_max != + point_vertex_diameter_max; + dirty |= system_constants_.point_constant_diameter[0] != + point_constant_diameter_x; + dirty |= system_constants_.point_constant_diameter[1] != + point_constant_diameter_y; + system_constants_.point_vertex_diameter_min = point_vertex_diameter_min; + system_constants_.point_vertex_diameter_max = point_vertex_diameter_max; + system_constants_.point_constant_diameter[0] = 
point_constant_diameter_x; + system_constants_.point_constant_diameter[1] = point_constant_diameter_y; + // 2 because 1 in the NDC is half of the viewport's axis, 0.5 for diameter + // to radius conversion to avoid multiplying the per-vertex diameter by an + // additional constant in the shader. + float point_screen_diameter_to_ndc_radius_x = + (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_x())) / + std::max(viewport_info.xy_extent[0], uint32_t(1)); + float point_screen_diameter_to_ndc_radius_y = + (/* 0.5f * 2.0f * */ float(texture_cache_->draw_resolution_scale_y())) / + std::max(viewport_info.xy_extent[1], uint32_t(1)); + dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] != + point_screen_diameter_to_ndc_radius_x; + dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] != + point_screen_diameter_to_ndc_radius_y; + system_constants_.point_screen_diameter_to_ndc_radius[0] = + point_screen_diameter_to_ndc_radius_x; + system_constants_.point_screen_diameter_to_ndc_radius[1] = + point_screen_diameter_to_ndc_radius_y; + } + // Texture signedness / gamma. { uint32_t textures_remaining = used_texture_mask; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index df7156b08..7cf30e250 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -133,6 +133,11 @@ VulkanPipelineCache::GetCurrentVertexShaderModification( modification.vertex.interpolator_mask = interpolator_mask; + modification.vertex.output_point_size = + uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) && + regs.Get().prim_type == + xenos::PrimitiveType::kPointList); + return modification; } @@ -284,6 +289,8 @@ bool VulkanPipelineCache::ConfigurePipeline( if (GetGeometryShaderKey( description.geometry_shader, SpirvShaderTranslator::Modification(vertex_shader->modification()), + SpirvShaderTranslator::Modification( + pixel_shader ? 
pixel_shader->modification() : 0), geometry_shader_key)) { geometry_shader = GetGeometryShader(geometry_shader_key); if (geometry_shader == VK_NULL_HANDLE) { @@ -496,6 +503,7 @@ bool VulkanPipelineCache::GetCurrentStateDescription( PipelinePrimitiveTopology primitive_topology; switch (primitive_processing_result.host_primitive_type) { case xenos::PrimitiveType::kPointList: + geometry_shader = PipelineGeometryShader::kPointList; primitive_topology = PipelinePrimitiveTopology::kPointList; break; case xenos::PrimitiveType::kLineList: @@ -815,6 +823,7 @@ bool VulkanPipelineCache::ArePipelineRequirementsMet( bool VulkanPipelineCache::GetGeometryShaderKey( PipelineGeometryShader geometry_shader_type, SpirvShaderTranslator::Modification vertex_shader_modification, + SpirvShaderTranslator::Modification pixel_shader_modification, GeometryShaderKey& key_out) { if (geometry_shader_type == PipelineGeometryShader::kNone) { return false; @@ -831,10 +840,8 @@ bool VulkanPipelineCache::GetGeometryShaderKey( /* vertex_shader_modification.vertex.user_clip_plane_cull */ 0; key.has_vertex_kill_and = /* vertex_shader_modification.vertex.vertex_kill_and */ 0; - key.has_point_size = - /* vertex_shader_modification.vertex.output_point_size */ 0; - key.has_point_coordinates = - /* pixel_shader_modification.pixel.param_gen_point */ 0; + key.has_point_size = vertex_shader_modification.vertex.output_point_size; + key.has_point_coordinates = pixel_shader_modification.pixel.param_gen_point; key_out = key; return true; } @@ -853,6 +860,13 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { spv::ExecutionMode output_primitive_execution_mode = spv::ExecutionMode(0); uint32_t output_max_vertices = 0; switch (key.type) { + case PipelineGeometryShader::kPointList: + // Point to a strip of 2 triangles. 
+ input_primitive_execution_mode = spv::ExecutionModeInputPoints; + input_primitive_vertex_count = 1; + output_primitive_execution_mode = spv::ExecutionModeOutputTriangleStrip; + output_max_vertices = 4; + break; case PipelineGeometryShader::kRectangleList: // Triangle to a strip of 2 triangles. input_primitive_execution_mode = spv::ExecutionModeTriangles; @@ -901,6 +915,7 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { spv::Id type_bool4 = builder.makeVectorType(type_bool, 4); spv::Id type_int = builder.makeIntType(32); spv::Id type_float = builder.makeFloatType(32); + spv::Id type_float2 = builder.makeVectorType(type_float, 2); spv::Id type_float4 = builder.makeVectorType(type_float, 4); spv::Id type_clip_distances = clip_distance_count @@ -912,9 +927,54 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { ? builder.makeArrayType( type_float, builder.makeUintConstant(cull_distance_count), 0) : spv::NoType; - spv::Id type_point_coordinates = key.has_point_coordinates - ? builder.makeVectorType(type_float, 2) - : spv::NoType; + + // System constants. 
+ // For points: + // - float2 point_constant_diameter + // - float2 point_screen_diameter_to_ndc_radius + enum PointConstant : uint32_t { + kPointConstantConstantDiameter, + kPointConstantScreenDiameterToNdcRadius, + kPointConstantCount, + }; + spv::Id type_system_constants = spv::NoType; + if (key.type == PipelineGeometryShader::kPointList) { + id_vector_temp.clear(); + id_vector_temp.resize(kPointConstantCount); + id_vector_temp[kPointConstantConstantDiameter] = type_float2; + id_vector_temp[kPointConstantScreenDiameterToNdcRadius] = type_float2; + type_system_constants = + builder.makeStructType(id_vector_temp, "XeSystemConstants"); + builder.addMemberName(type_system_constants, kPointConstantConstantDiameter, + "point_constant_diameter"); + builder.addMemberDecoration( + type_system_constants, kPointConstantConstantDiameter, + spv::DecorationOffset, + int(offsetof(SpirvShaderTranslator::SystemConstants, + point_constant_diameter))); + builder.addMemberName(type_system_constants, + kPointConstantScreenDiameterToNdcRadius, + "point_screen_diameter_to_ndc_radius"); + builder.addMemberDecoration( + type_system_constants, kPointConstantScreenDiameterToNdcRadius, + spv::DecorationOffset, + int(offsetof(SpirvShaderTranslator::SystemConstants, + point_screen_diameter_to_ndc_radius))); + } + spv::Id uniform_system_constants = spv::NoResult; + if (type_system_constants != spv::NoType) { + builder.addDecoration(type_system_constants, spv::DecorationBlock); + uniform_system_constants = builder.createVariable( + spv::NoPrecision, spv::StorageClassUniform, type_system_constants, + "xe_uniform_system_constants"); + builder.addDecoration(uniform_system_constants, + spv::DecorationDescriptorSet, + int(SpirvShaderTranslator::kDescriptorSetConstants)); + builder.addDecoration(uniform_system_constants, spv::DecorationBinding, + int(SpirvShaderTranslator::kConstantBufferSystem)); + // Generating SPIR-V 1.0, no need to add bindings to the entry point's + // interface until SPIR-V 
1.4. + } // Inputs and outputs - matching glslang order, in gl_PerVertex gl_in[], // user-defined outputs, user-defined inputs, out gl_PerVertex. @@ -977,6 +1037,8 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { type_array_in_gl_per_vertex, "gl_in"); main_interface.push_back(in_gl_per_vertex); + uint32_t output_location = 0; + // Interpolators outputs. std::array out_interpolators; for (uint32_t i = 0; i < key.interpolator_count; ++i) { @@ -984,23 +1046,28 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { spv::NoPrecision, spv::StorageClassOutput, type_float4, fmt::format("xe_out_interpolator_{}", i).c_str()); out_interpolators[i] = out_interpolator; - builder.addDecoration(out_interpolator, spv::DecorationLocation, i); + builder.addDecoration(out_interpolator, spv::DecorationLocation, + int(output_location)); builder.addDecoration(out_interpolator, spv::DecorationInvariant); main_interface.push_back(out_interpolator); + ++output_location; } // Point coordinate output. spv::Id out_point_coordinates = spv::NoResult; if (key.has_point_coordinates) { - out_point_coordinates = builder.createVariable( - spv::NoPrecision, spv::StorageClassOutput, type_point_coordinates, - "xe_out_point_coordinates"); + out_point_coordinates = + builder.createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float2, "xe_out_point_coordinates"); builder.addDecoration(out_point_coordinates, spv::DecorationLocation, - key.interpolator_count); + int(output_location)); builder.addDecoration(out_point_coordinates, spv::DecorationInvariant); main_interface.push_back(out_point_coordinates); + ++output_location; } + uint32_t input_location = 0; + // Interpolator inputs. 
std::array in_interpolators; for (uint32_t i = 0; i < key.interpolator_count; ++i) { @@ -1010,8 +1077,10 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { 0), fmt::format("xe_in_interpolator_{}", i).c_str()); in_interpolators[i] = in_interpolator; - builder.addDecoration(in_interpolator, spv::DecorationLocation, i); + builder.addDecoration(in_interpolator, spv::DecorationLocation, + int(input_location)); main_interface.push_back(in_interpolator); + ++input_location; } // Point size input. @@ -1023,8 +1092,9 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { 0), "xe_in_point_size"); builder.addDecoration(in_point_size, spv::DecorationLocation, - key.interpolator_count); + int(input_location)); main_interface.push_back(in_point_size); + ++input_location; } // out gl_PerVertex. @@ -1198,6 +1268,231 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { } switch (key.type) { + case PipelineGeometryShader::kPointList: { + // Expand the point sprite, with left-to-right, top-to-bottom UVs. + + spv::Id const_int_0 = builder.makeIntConstant(0); + spv::Id const_int_1 = builder.makeIntConstant(1); + spv::Id const_float_0 = builder.makeFloatConstant(0.0f); + + // Load the point diameter in guest pixels. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back( + builder.makeIntConstant(int32_t(kPointConstantConstantDiameter))); + id_vector_temp.push_back(const_int_0); + spv::Id point_guest_diameter_x = builder.createLoad( + builder.createAccessChain(spv::StorageClassUniform, + uniform_system_constants, id_vector_temp), + spv::NoPrecision); + id_vector_temp.back() = const_int_1; + spv::Id point_guest_diameter_y = builder.createLoad( + builder.createAccessChain(spv::StorageClassUniform, + uniform_system_constants, id_vector_temp), + spv::NoPrecision); + if (key.has_point_size) { + // The vertex shader's header writes -1.0 to point_size by default, so + // any non-negative value means that it was overwritten by the + // translated vertex shader, and needs to be used instead of the + // constant size. The per-vertex diameter is already clamped in the + // vertex shader (combined with making it non-negative). + id_vector_temp.clear(); + // 0 is the input primitive vertex index. + id_vector_temp.push_back(const_int_0); + spv::Id point_vertex_diameter = builder.createLoad( + builder.createAccessChain(spv::StorageClassInput, in_point_size, + id_vector_temp), + spv::NoPrecision); + spv::Id point_vertex_diameter_written = + builder.createBinOp(spv::OpFOrdGreaterThanEqual, type_bool, + point_vertex_diameter, const_float_0); + point_guest_diameter_x = builder.createTriOp( + spv::OpSelect, type_float, point_vertex_diameter_written, + point_vertex_diameter, point_guest_diameter_x); + point_guest_diameter_y = builder.createTriOp( + spv::OpSelect, type_float, point_vertex_diameter_written, + point_vertex_diameter, point_guest_diameter_y); + } + + // 4D5307F1 has zero-size snowflakes, drop them quicker, and also drop + // points with a constant size of zero since point lists may also be used + // as just "compute" with memexport. 
+ spv::Id point_size_not_zero = builder.createBinOp( + spv::OpLogicalAnd, type_bool, + builder.createBinOp(spv::OpFOrdGreaterThan, type_bool, + point_guest_diameter_x, const_float_0), + builder.createBinOp(spv::OpFOrdGreaterThan, type_bool, + point_guest_diameter_y, const_float_0)); + spv::Block& point_size_zero_predecessor = *builder.getBuildPoint(); + spv::Block& point_size_zero_then_block = builder.makeNewBlock(); + spv::Block& point_size_zero_merge_block = builder.makeNewBlock(); + { + std::unique_ptr selection_merge_op( + std::make_unique(spv::OpSelectionMerge)); + selection_merge_op->addIdOperand(point_size_zero_merge_block.getId()); + selection_merge_op->addImmediateOperand( + spv::SelectionControlDontFlattenMask); + point_size_zero_predecessor.addInstruction( + std::move(selection_merge_op)); + } + { + std::unique_ptr branch_conditional_op( + std::make_unique(spv::OpBranchConditional)); + branch_conditional_op->addIdOperand(point_size_not_zero); + branch_conditional_op->addIdOperand( + point_size_zero_merge_block.getId()); + branch_conditional_op->addIdOperand(point_size_zero_then_block.getId()); + branch_conditional_op->addImmediateOperand(2); + branch_conditional_op->addImmediateOperand(1); + point_size_zero_predecessor.addInstruction( + std::move(branch_conditional_op)); + } + point_size_zero_then_block.addPredecessor(&point_size_zero_predecessor); + point_size_zero_merge_block.addPredecessor(&point_size_zero_predecessor); + builder.setBuildPoint(&point_size_zero_then_block); + builder.createNoResultOp(spv::OpReturn); + builder.setBuildPoint(&point_size_zero_merge_block); + + // Transform the diameter in the guest screen coordinates to radius in the + // normalized device coordinates, and then to the clip space by + // multiplying by W. 
+ id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back(builder.makeIntConstant( + int32_t(kPointConstantScreenDiameterToNdcRadius))); + id_vector_temp.push_back(const_int_0); + spv::Id point_radius_x = builder.createBinOp( + spv::OpFMul, type_float, point_guest_diameter_x, + builder.createLoad(builder.createAccessChain(spv::StorageClassUniform, + uniform_system_constants, + id_vector_temp), + spv::NoPrecision)); + builder.addDecoration(point_radius_x, spv::DecorationNoContraction); + id_vector_temp.back() = const_int_1; + spv::Id point_radius_y = builder.createBinOp( + spv::OpFMul, type_float, point_guest_diameter_y, + builder.createLoad(builder.createAccessChain(spv::StorageClassUniform, + uniform_system_constants, + id_vector_temp), + spv::NoPrecision)); + builder.addDecoration(point_radius_y, spv::DecorationNoContraction); + id_vector_temp.clear(); + id_vector_temp.reserve(2); + // 0 is the input primitive vertex index. + id_vector_temp.push_back(const_int_0); + id_vector_temp.push_back(const_member_in_gl_per_vertex_position); + spv::Id point_position = builder.createLoad( + builder.createAccessChain(spv::StorageClassInput, in_gl_per_vertex, + id_vector_temp), + spv::NoPrecision); + spv::Id point_w = + builder.createCompositeExtract(point_position, type_float, 3); + point_radius_x = + builder.createBinOp(spv::OpFMul, type_float, point_radius_x, point_w); + builder.addDecoration(point_radius_x, spv::DecorationNoContraction); + point_radius_y = + builder.createBinOp(spv::OpFMul, type_float, point_radius_y, point_w); + builder.addDecoration(point_radius_y, spv::DecorationNoContraction); + + // Load the inputs for the guest point. + // Interpolators. + std::array point_interpolators; + id_vector_temp.clear(); + // 0 is the input primitive vertex index. 
+ id_vector_temp.push_back(const_int_0); + for (uint32_t i = 0; i < key.interpolator_count; ++i) { + point_interpolators[i] = builder.createLoad( + builder.createAccessChain(spv::StorageClassInput, + in_interpolators[i], id_vector_temp), + spv::NoPrecision); + } + // Positions. + spv::Id point_x = + builder.createCompositeExtract(point_position, type_float, 0); + spv::Id point_y = + builder.createCompositeExtract(point_position, type_float, 1); + std::array point_edge_x, point_edge_y; + for (uint32_t i = 0; i < 2; ++i) { + spv::Op point_radius_add_op = i ? spv::OpFAdd : spv::OpFSub; + point_edge_x[i] = builder.createBinOp(point_radius_add_op, type_float, + point_x, point_radius_x); + builder.addDecoration(point_edge_x[i], spv::DecorationNoContraction); + point_edge_y[i] = builder.createBinOp(point_radius_add_op, type_float, + point_y, point_radius_y); + builder.addDecoration(point_edge_y[i], spv::DecorationNoContraction); + }; + spv::Id point_z = + builder.createCompositeExtract(point_position, type_float, 2); + // Clip distances. + spv::Id point_clip_distances = spv::NoResult; + if (clip_distance_count) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + // 0 is the input primitive vertex index. + id_vector_temp.push_back(const_int_0); + id_vector_temp.push_back(const_member_in_gl_per_vertex_clip_distance); + point_clip_distances = builder.createLoad( + builder.createAccessChain(spv::StorageClassInput, in_gl_per_vertex, + id_vector_temp), + spv::NoPrecision); + } + + for (uint32_t i = 0; i < 4; ++i) { + // Same interpolators for the entire sprite. + for (uint32_t j = 0; j < key.interpolator_count; ++j) { + builder.createStore(point_interpolators[j], out_interpolators[j]); + } + // Top-left, bottom-left, top-right, bottom-right order (chosen + // arbitrarily, simply based on counterclockwise meaning front with + // frontFace = VkFrontFace(0), but faceness is ignored for non-polygon + // primitive types). 
+ uint32_t point_vertex_x = i >> 1; + uint32_t point_vertex_y = i & 1; + // Point coordinates. + if (key.has_point_coordinates) { + id_vector_temp.clear(); + id_vector_temp.reserve(2); + id_vector_temp.push_back( + builder.makeFloatConstant(float(point_vertex_x))); + id_vector_temp.push_back( + builder.makeFloatConstant(float(point_vertex_y))); + builder.createStore( + builder.makeCompositeConstant(type_float2, id_vector_temp), + out_point_coordinates); + } + // Position. + id_vector_temp.clear(); + id_vector_temp.reserve(4); + id_vector_temp.push_back(point_edge_x[point_vertex_x]); + id_vector_temp.push_back(point_edge_y[point_vertex_y]); + id_vector_temp.push_back(point_z); + id_vector_temp.push_back(point_w); + spv::Id point_vertex_position = + builder.createCompositeConstruct(type_float4, id_vector_temp); + id_vector_temp.clear(); + id_vector_temp.push_back(const_member_out_gl_per_vertex_position); + builder.createStore( + point_vertex_position, + builder.createAccessChain(spv::StorageClassOutput, + out_gl_per_vertex, id_vector_temp)); + // Clip distances. + // TODO(Triang3l): Handle ps_ucp_mode properly, clip expanded points if + // needed. + if (clip_distance_count) { + id_vector_temp.clear(); + id_vector_temp.push_back( + const_member_out_gl_per_vertex_clip_distance); + builder.createStore( + point_clip_distances, + builder.createAccessChain(spv::StorageClassOutput, + out_gl_per_vertex, id_vector_temp)); + } + // Emit the vertex. + builder.createNoResultOp(spv::OpEmitVertex); + } + builder.createNoResultOp(spv::OpEndPrimitive); + } break; + case PipelineGeometryShader::kRectangleList: { // Construct a strip with the fourth vertex generated by mirroring a // vertex across the longest edge (the diagonal). 
@@ -1308,8 +1603,8 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { id_vector_temp.reserve(2); id_vector_temp.push_back(const_float_0); id_vector_temp.push_back(const_float_0); - const_point_coordinates_zero = builder.makeCompositeConstant( - type_point_coordinates, id_vector_temp); + const_point_coordinates_zero = + builder.makeCompositeConstant(type_float2, id_vector_temp); } // Emit the triangle in the strip that consists of the original vertices. @@ -1491,8 +1786,8 @@ VkShaderModule VulkanPipelineCache::GetGeometryShader(GeometryShaderKey key) { id_vector_temp.reserve(2); id_vector_temp.push_back(const_float_0); id_vector_temp.push_back(const_float_0); - const_point_coordinates_zero = builder.makeCompositeConstant( - type_point_coordinates, id_vector_temp); + const_point_coordinates_zero = + builder.makeCompositeConstant(type_float2, id_vector_temp); } // Build the triangle strip from the original quad vertices in the diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h index e967a1415..6e0c73ab0 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h @@ -92,6 +92,7 @@ class VulkanPipelineCache { private: enum class PipelineGeometryShader : uint32_t { kNone, + kPointList, kRectangleList, kQuadList, }; @@ -267,6 +268,7 @@ class VulkanPipelineCache { static bool GetGeometryShaderKey( PipelineGeometryShader geometry_shader_type, SpirvShaderTranslator::Modification vertex_shader_modification, + SpirvShaderTranslator::Modification pixel_shader_modification, GeometryShaderKey& key_out); VkShaderModule GetGeometryShader(GeometryShaderKey key); From 8fb5da18eaa8f9d96d556fb67fd090e0fc055ab8 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 26 Jul 2022 16:24:14 +0300 Subject: [PATCH 2/5] [Vulkan] Add forgotten fullDrawIndexUint32 check --- src/xenia/gpu/vulkan/vulkan_primitive_processor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc index 058b6a5d1..b7f37f4b9 100644 --- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc @@ -27,14 +27,13 @@ namespace vulkan { VulkanPrimitiveProcessor::~VulkanPrimitiveProcessor() { Shutdown(true); } bool VulkanPrimitiveProcessor::Initialize() { - // TODO(Triang3l): fullDrawIndexUint32 feature check and indirect index fetch. const ui::vulkan::VulkanProvider& provider = command_processor_.GetVulkanProvider(); const VkPhysicalDeviceFeatures& device_features = provider.device_features(); const VkPhysicalDevicePortabilitySubsetFeaturesKHR* device_portability_subset_features = provider.device_portability_subset_features(); - if (!InitializeCommon(true, + if (!InitializeCommon(device_features.fullDrawIndexUint32, !device_portability_subset_features || device_portability_subset_features->triangleFans, false, device_features.geometryShader)) { From 66c995f3aa26cb3e860144d9cf38883fadf6e4e3 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 26 Jul 2022 17:04:22 +0300 Subject: [PATCH 3/5] [SPIR-V] Saturate point sprite coordinates --- src/xenia/gpu/spirv_shader_translator.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 452ac1450..199d0f99c 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -1619,8 +1619,17 @@ void SpirvShaderTranslator::StartFragmentShaderInMain() { spv::Id param_gen_z, param_gen_w; if (modification.pixel.param_gen_point) { assert_true(input_point_coordinates_ != spv::NoResult); + // Saturate to avoid negative point coordinates if the center of the pixel + // is not covered, and extrapolation is done. 
+ id_vector_temp_.clear(); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back( + builder_->createLoad(input_point_coordinates_, spv::NoPrecision)); + id_vector_temp_.push_back(const_float2_0_); + id_vector_temp_.push_back(const_float2_1_); spv::Id param_gen_point_coordinates = - builder_->createLoad(input_point_coordinates_, spv::NoPrecision); + builder_->createBuiltinCall(type_float2_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_); param_gen_z = builder_->createCompositeExtract( param_gen_point_coordinates, type_float_, 0); param_gen_w = builder_->createCompositeExtract( From ff7ef050632fb0c85760383f911bb9328e42176c Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 26 Jul 2022 17:08:12 +0300 Subject: [PATCH 4/5] [SPIR-V] Clamp cube face using NClamp, not NMax/FMin --- src/xenia/gpu/spirv_shader_translator_fetch.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/xenia/gpu/spirv_shader_translator_fetch.cc b/src/xenia/gpu/spirv_shader_translator_fetch.cc index 7be662460..88d3bd5ab 100644 --- a/src/xenia/gpu/spirv_shader_translator_fetch.cc +++ b/src/xenia/gpu/spirv_shader_translator_fetch.cc @@ -1296,18 +1296,14 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( builder_->addDecoration(face, spv::DecorationNoContraction); } id_vector_temp_.clear(); - id_vector_temp_.reserve(2); + id_vector_temp_.reserve(3); + id_vector_temp_.push_back(face); id_vector_temp_.push_back(const_float_0_); - id_vector_temp_.push_back(face); - face = builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, - GLSLstd450NMax, id_vector_temp_); - id_vector_temp_.clear(); - id_vector_temp_.reserve(2); id_vector_temp_.push_back(builder_->makeFloatConstant(5.0f)); - id_vector_temp_.push_back(face); - face = builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, - GLSLstd450FMin, id_vector_temp_); - face = builder_->createUnaryOp(spv::OpConvertFToU, type_uint_, face); + face = builder_->createUnaryOp( + 
spv::OpConvertFToU, type_uint_, + builder_->createBuiltinCall(type_float_, ext_inst_glsl_std_450_, + GLSLstd450NClamp, id_vector_temp_)); // Split the face index into the axis and the sign. spv::Id const_uint_1 = builder_->makeUintConstant(1); spv::Id face_axis = builder_->createBinOp( From 7595cdb52bd12d448aeabe4908862f59d283ce9d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Wed, 27 Jul 2022 17:14:28 +0300 Subject: [PATCH 5/5] [Vulkan] Non-GS point sprites + minor SPIR-V fixes --- .../gpu/d3d12/d3d12_command_processor.cc | 7 +- .../gpu/d3d12/d3d12_primitive_processor.cc | 25 +- .../gpu/d3d12/d3d12_primitive_processor.h | 5 +- src/xenia/gpu/dxbc_shader_translator.cc | 4 - src/xenia/gpu/primitive_processor.cc | 250 +++++++-- src/xenia/gpu/primitive_processor.h | 57 ++- src/xenia/gpu/spirv_shader_translator.cc | 482 ++++++++++++++---- src/xenia/gpu/spirv_shader_translator.h | 9 +- .../gpu/vulkan/vulkan_command_processor.cc | 90 ++-- .../gpu/vulkan/vulkan_command_processor.h | 4 +- src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc | 29 +- src/xenia/gpu/vulkan/vulkan_pipeline_cache.h | 2 +- .../gpu/vulkan/vulkan_primitive_processor.cc | 26 +- .../gpu/vulkan/vulkan_primitive_processor.h | 5 +- 14 files changed, 721 insertions(+), 274 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index b6f72ff9b..129f89fd0 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2268,7 +2268,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, UpdateSystemConstantValues( memexport_used, primitive_polygonal, primitive_processing_result.line_loop_closing_index, - primitive_processing_result.host_index_endian, viewport_info, + primitive_processing_result.host_shader_index_endian, viewport_info, used_texture_mask, normalized_depth_control, normalized_color_mask); // Update constant buffers, descriptors and root parameters. 
@@ -2513,7 +2513,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } ID3D12Resource* scratch_index_buffer = nullptr; switch (primitive_processing_result.index_buffer_type) { - case PrimitiveProcessor::ProcessedIndexBufferType::kGuest: { + case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA: { if (memexport_used) { // If the shared memory is a UAV, it can't be used as an index buffer // (UAV is a read/write state, index buffer is a read-only state). @@ -2545,7 +2545,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, primitive_processor_->GetConvertedIndexBufferGpuAddress( primitive_processing_result.host_index_buffer_handle); break; - case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA: index_buffer_view.BufferLocation = primitive_processor_->GetBuiltinIndexBufferGpuAddress( primitive_processing_result.host_index_buffer_handle); diff --git a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc index a806546a1..03e67d9ac 100644 --- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc @@ -28,7 +28,7 @@ namespace d3d12 { D3D12PrimitiveProcessor::~D3D12PrimitiveProcessor() { Shutdown(true); } bool D3D12PrimitiveProcessor::Initialize() { - if (!InitializeCommon(true, false, false, true)) { + if (!InitializeCommon(true, false, false, true, true, true)) { Shutdown(); return false; } @@ -83,9 +83,9 @@ void D3D12PrimitiveProcessor::EndFrame() { frame_index_buffers_.clear(); } -bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, std::function fill_callback) { - assert_not_zero(index_count); +bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) { + 
assert_not_zero(size_bytes); assert_null(builtin_index_buffer_); assert_null(builtin_index_buffer_upload_); @@ -94,9 +94,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( ID3D12Device* device = provider.GetDevice(); D3D12_RESOURCE_DESC resource_desc; - ui::d3d12::util::FillBufferResourceDesc( - resource_desc, UINT64(sizeof(uint16_t) * index_count), - D3D12_RESOURCE_FLAG_NONE); + ui::d3d12::util::FillBufferResourceDesc(resource_desc, UINT64(size_bytes), + D3D12_RESOURCE_FLAG_NONE); Microsoft::WRL::ComPtr draw_resource; if (FAILED(device->CreateCommittedResource( &ui::d3d12::util::kHeapPropertiesDefault, @@ -105,8 +104,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( IID_PPV_ARGS(&draw_resource)))) { XELOGE( "D3D12 primitive processor: Failed to create the built-in index " - "buffer GPU resource with {} 16-bit indices", - index_count); + "buffer GPU resource with {} bytes", + size_bytes); return false; } Microsoft::WRL::ComPtr upload_resource; @@ -117,8 +116,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( IID_PPV_ARGS(&upload_resource)))) { XELOGE( "D3D12 primitive processor: Failed to create the built-in index " - "buffer upload resource with {} 16-bit indices", - index_count); + "buffer upload resource with {} bytes", + size_bytes); return false; } @@ -127,8 +126,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( if (FAILED(upload_resource->Map(0, &upload_read_range, &mapping))) { XELOGE( "D3D12 primitive processor: Failed to map the built-in index buffer " - "upload resource with {} 16-bit indices", - index_count); + "upload resource with {} bytes", + size_bytes); return false; } fill_callback(reinterpret_cast(mapping)); diff --git a/src/xenia/gpu/d3d12/d3d12_primitive_processor.h b/src/xenia/gpu/d3d12/d3d12_primitive_processor.h index 81e1812a6..8ac02f4db 100644 --- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.h @@ -56,9 +56,8 
@@ class D3D12PrimitiveProcessor final : public PrimitiveProcessor { } protected: - bool InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, - std::function fill_callback) override; + bool InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) override; void* RequestHostConvertedIndexBufferForCurrentFrame( xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd, diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 602da9ce8..daa8cf782 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -964,8 +964,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() { // Check if the shader returns XY/W rather than XY, and if it does, revert // that. - // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_XYDividedByW)); a_.OpIf(true, temp_x_src); a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0011), @@ -974,8 +972,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() { a_.OpEndIf(); // Check if the shader returns Z/W rather than Z, and if it does, revert that. - // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. 
a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_ZDividedByW)); a_.OpIf(true, temp_x_src); a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0100), diff --git a/src/xenia/gpu/primitive_processor.cc b/src/xenia/gpu/primitive_processor.cc index 68da6d100..827fb7b4e 100644 --- a/src/xenia/gpu/primitive_processor.cc +++ b/src/xenia/gpu/primitive_processor.cc @@ -9,6 +9,7 @@ #include "xenia/gpu/primitive_processor.h" +#include #include #include #include @@ -106,7 +107,9 @@ PrimitiveProcessor::~PrimitiveProcessor() { ShutdownCommon(); } bool PrimitiveProcessor::InitializeCommon( bool full_32bit_vertex_indices_supported, bool triangle_fans_supported, - bool line_loops_supported, bool quad_lists_supported) { + bool line_loops_supported, bool quad_lists_supported, + bool point_sprites_supported_without_vs_expansion, + bool rectangle_lists_supported_without_vs_expansion) { full_32bit_vertex_indices_used_ = full_32bit_vertex_indices_supported; convert_triangle_fans_to_lists_ = !triangle_fans_supported || cvars::force_convert_triangle_fans_to_lists; @@ -115,33 +118,94 @@ bool PrimitiveProcessor::InitializeCommon( convert_quad_lists_to_triangle_lists_ = !quad_lists_supported || cvars::force_convert_quad_lists_to_triangle_lists; + // No override cvars as hosts are not required to support the fallback paths + // since they require different vertex shader structure (for the fallback + // HostVertexShaderTypes). + expand_point_sprites_in_vs_ = !point_sprites_supported_without_vs_expansion; + expand_rectangle_lists_in_vs_ = + !rectangle_lists_supported_without_vs_expansion; // Initialize the index buffer for conversion of auto-indexed primitive types. 
- uint32_t builtin_index_count = 0; + size_t builtin_index_buffer_size = 0; + // 32-bit, before 16-bit due to alignment (for primitive expansion - when the + // indices encode not only the guest vertex index, but also a part needed for + // host expansion, thus may contain values above UINT16_MAX, such as up to + // (UINT16_MAX - 1) * 4 + 3 for point sprites). + // Using an index buffer for point sprite and rectangle list expansion instead + // of instancing as how instancing is implemented may vary wildly between + // GPUs, potentially slowly (like no different instances in the same + // wavefront) with small vertex counts per instance. Also using triangle + // strips with primitive restart, not triangle lists, so the vertex shader may + // be invoked once for the inner edge vertices, which is important for memory + // export in guest shaders, not to write to the same location from two + // invocations. + uint32_t builtin_ib_two_triangle_strip_count = 0; + if (expand_point_sprites_in_vs_) { + builtin_ib_two_triangle_strip_count = + std::max(uint32_t(UINT16_MAX), builtin_ib_two_triangle_strip_count); + } + if (expand_rectangle_lists_in_vs_) { + builtin_ib_two_triangle_strip_count = + std::max(uint32_t(UINT16_MAX / 3), builtin_ib_two_triangle_strip_count); + } + if (builtin_ib_two_triangle_strip_count) { + builtin_ib_offset_two_triangle_strips_ = builtin_index_buffer_size; + builtin_index_buffer_size += + sizeof(uint32_t) * + GetTwoTriangleStripIndexCount(builtin_ib_two_triangle_strip_count); + } else { + builtin_ib_offset_two_triangle_strips_ = SIZE_MAX; + } + // 16-bit (for indirection on top of single auto-indexed vertices) - enough + // even if the backend has primitive reset enabled all the time (Metal) as + // auto-indexed draws are limited to UINT16_MAX vertices, not UINT16_MAX + 1. 
if (convert_triangle_fans_to_lists_) { - builtin_ib_offset_triangle_fans_to_lists_ = - sizeof(uint16_t) * builtin_index_count; - builtin_index_count += GetTriangleFanListIndexCount(UINT16_MAX); + builtin_ib_offset_triangle_fans_to_lists_ = builtin_index_buffer_size; + builtin_index_buffer_size += + sizeof(uint16_t) * GetTriangleFanListIndexCount(UINT16_MAX); } else { builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX; } if (convert_quad_lists_to_triangle_lists_) { - builtin_ib_offset_quad_lists_to_triangle_lists_ = - sizeof(uint16_t) * builtin_index_count; - builtin_index_count += GetQuadListTriangleListIndexCount(UINT16_MAX); + builtin_ib_offset_quad_lists_to_triangle_lists_ = builtin_index_buffer_size; + builtin_index_buffer_size += + sizeof(uint16_t) * GetQuadListTriangleListIndexCount(UINT16_MAX); } else { builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX; } - if (builtin_index_count) { - if (!InitializeBuiltin16BitIndexBuffer( - builtin_index_count, [this](uint16_t* mapping) { + if (builtin_index_buffer_size) { + if (!InitializeBuiltinIndexBuffer( + builtin_index_buffer_size, + [this, builtin_ib_two_triangle_strip_count](void* mapping) { + uint32_t* mapping_32bit = reinterpret_cast(mapping); + if (builtin_ib_offset_two_triangle_strips_ != SIZE_MAX) { + // Two-triangle strips. + uint32_t* two_triangle_strip_ptr = + mapping_32bit + + builtin_ib_offset_two_triangle_strips_ / sizeof(uint32_t); + for (uint32_t i = 0; i < builtin_ib_two_triangle_strip_count; + ++i) { + if (i) { + // Primitive restart. + *(two_triangle_strip_ptr++) = UINT32_MAX; + } + // Host vertex index within the pair in the lower 2 bits, + // guest primitive index in the rest. 
+ uint32_t two_triangle_strip_first_index = i << 2; + for (uint32_t j = 0; j < 4; ++j) { + *(two_triangle_strip_ptr++) = + two_triangle_strip_first_index + j; + } + } + } + uint16_t* mapping_16bit = reinterpret_cast(mapping); if (builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX) { // Triangle fans as triangle lists. // Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D. // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans uint16_t* triangle_list_ptr = - mapping + builtin_ib_offset_triangle_fans_to_lists_ / - sizeof(uint16_t); + mapping_16bit + builtin_ib_offset_triangle_fans_to_lists_ / + sizeof(uint16_t); for (uint32_t i = 2; i < UINT16_MAX; ++i) { *(triangle_list_ptr++) = uint16_t(i - 1); *(triangle_list_ptr++) = uint16_t(i); @@ -150,8 +214,9 @@ bool PrimitiveProcessor::InitializeCommon( } if (builtin_ib_offset_quad_lists_to_triangle_lists_ != SIZE_MAX) { uint16_t* triangle_list_ptr = - mapping + builtin_ib_offset_quad_lists_to_triangle_lists_ / - sizeof(uint16_t); + mapping_16bit + + builtin_ib_offset_quad_lists_to_triangle_lists_ / + sizeof(uint16_t); // TODO(Triang3l): SIMD for faster initialization? for (uint32_t i = 0; i < UINT16_MAX / 4; ++i) { uint16_t quad_first_index = uint16_t(i * 4); @@ -309,15 +374,27 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { return false; } } else { + host_vertex_shader_type = Shader::HostVertexShaderType::kVertex; switch (guest_primitive_type) { case xenos::PrimitiveType::kPointList: + if (expand_point_sprites_in_vs_) { + host_primitive_type = xenos::PrimitiveType::kTriangleStrip; + host_vertex_shader_type = + Shader::HostVertexShaderType::kPointListAsTriangleStrip; + } + break; case xenos::PrimitiveType::kLineList: case xenos::PrimitiveType::kLineStrip: case xenos::PrimitiveType::kTriangleList: case xenos::PrimitiveType::kTriangleStrip: + // Supported natively on all backends. 
+ break; case xenos::PrimitiveType::kRectangleList: - // Supported natively or through geometry or compute shaders on all - // backends. + if (expand_rectangle_lists_in_vs_) { + host_primitive_type = xenos::PrimitiveType::kTriangleStrip; + host_vertex_shader_type = + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip; + } break; case xenos::PrimitiveType::kTriangleFan: if (convert_triangle_fans_to_lists_) { @@ -342,7 +419,6 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_always(); return false; } - host_vertex_shader_type = Shader::HostVertexShaderType::kVertex; } // Process the indices. @@ -359,12 +435,86 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { guest_draw_vertex_count = vgt_dma_size.num_words; } uint32_t line_loop_closing_index = 0; - uint32_t guest_index_base; + uint32_t guest_index_base = 0, guest_index_buffer_needed_bytes = 0; CachedResult cacheable; cacheable.host_draw_vertex_count = guest_draw_vertex_count; cacheable.host_primitive_reset_enabled = false; cacheable.host_index_buffer_handle = SIZE_MAX; - if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) { + if (host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip || + host_vertex_shader_type == + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) { + // As two-triangle strips, with guest indices being either autogenerated or + // fetched via DMA. 
+ uint32_t primitive_count = guest_draw_vertex_count; + if (host_vertex_shader_type == + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) { + primitive_count /= 3; + } + cacheable.host_draw_vertex_count = + GetTwoTriangleStripIndexCount(primitive_count); + cacheable.host_index_format = xenos::IndexFormat::kInt32; + cacheable.host_primitive_reset_enabled = true; + assert_true(builtin_ib_offset_two_triangle_strips_ != SIZE_MAX); + cacheable.host_index_buffer_handle = builtin_ib_offset_two_triangle_strips_; + if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) { + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForAuto; + cacheable.host_shader_index_endian = xenos::Endian::kNone; + } else { + // There is an index buffer. + assert_true(vgt_draw_initiator.source_select == + xenos::SourceSelect::kDMA); + if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA) { + // TODO(Triang3l): Support immediate-indexed vertices. + XELOGE( + "Primitive processor: Unsupported vertex index source {}. Report " + "the game to Xenia developers!", + uint32_t(vgt_draw_initiator.source_select)); + return false; + } + xenos::IndexFormat guest_index_format = vgt_draw_initiator.index_size; + // Normalize the endian. + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForDMA; + xenos::Endian guest_index_endian = vgt_dma_size.swap_mode; + if (guest_index_format == xenos::IndexFormat::kInt16 && + (guest_index_endian != xenos::Endian::kNone && + guest_index_endian != xenos::Endian::k8in16)) { + XELOGW( + "Primitive processor: 32-bit endian swap mode {} is used for " + "16-bit indices. This shouldn't normally be happening, but report " + "the game to Xenia developers for investigation of the intended " + "behavior (ignore or actually swap across adjacent indices)! 
" + "Currently disabling the swap for 16-and-32 and replacing 8-in-32 " + "with 8-in-16.", + uint32_t(guest_index_endian)); + guest_index_endian = guest_index_endian == xenos::Endian::k8in32 + ? xenos::Endian::k8in16 + : xenos::Endian::kNone; + } + cacheable.host_shader_index_endian = guest_index_endian; + // Get the index buffer memory range. + uint32_t index_size_log2 = + guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2; + // The base should already be aligned, but aligning here too for safety. + guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 & + ~uint32_t((1 << index_size_log2) - 1); + guest_index_buffer_needed_bytes = guest_draw_vertex_count + << index_size_log2; + if (guest_index_base > SharedMemory::kBufferSize || + SharedMemory::kBufferSize - guest_index_base < + guest_index_buffer_needed_bytes) { + XELOGE( + "Primitive processor: Index buffer at 0x{:08X}, 0x{:X} bytes " + "required, is out of the physical memory bounds", + guest_index_base, guest_index_buffer_needed_bytes); + assert_always(); + return false; + } + } + } else if (vgt_draw_initiator.source_select == + xenos::SourceSelect::kAutoIndex) { // Auto-indexed - use a remapping index buffer if needed to change the // primitive type. 
if (tessellation_enabled && @@ -376,9 +526,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_always(); return false; } - guest_index_base = 0; cacheable.host_index_format = xenos::IndexFormat::kInt16; - cacheable.host_index_endian = xenos::Endian::kNone; + cacheable.host_shader_index_endian = xenos::Endian::kNone; cacheable.host_primitive_reset_enabled = false; cacheable.index_buffer_type = ProcessedIndexBufferType::kNone; if (host_primitive_type != guest_primitive_type) { @@ -388,7 +537,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { xenos::PrimitiveType::kTriangleList); cacheable.host_draw_vertex_count = GetTriangleFanListIndexCount(cacheable.host_draw_vertex_count); - cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin; + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForAuto; assert_true(builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX); cacheable.host_index_buffer_handle = builtin_ib_offset_triangle_fans_to_lists_; @@ -409,7 +559,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { xenos::PrimitiveType::kTriangleList); cacheable.host_draw_vertex_count = GetQuadListTriangleListIndexCount( cacheable.host_draw_vertex_count); - cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin; + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForAuto; assert_true(builtin_ib_offset_quad_lists_to_triangle_lists_ != SIZE_MAX); cacheable.host_index_buffer_handle = @@ -503,8 +654,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { // The base should already be aligned, but aligning here too for safety. 
guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 & ~uint32_t((1 << index_size_log2) - 1); - uint32_t guest_index_buffer_needed_bytes = guest_draw_vertex_count - << index_size_log2; + guest_index_buffer_needed_bytes = guest_draw_vertex_count + << index_size_log2; if (guest_index_base > SharedMemory::kBufferSize || SharedMemory::kBufferSize - guest_index_base < guest_index_buffer_needed_bytes) { @@ -517,7 +668,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { } cacheable.host_index_format = guest_index_format; - cacheable.host_index_endian = guest_index_endian; + cacheable.host_shader_index_endian = guest_index_endian; uint32_t guest_index_mask_guest_endian = guest_index_format == xenos::IndexFormat::kInt16 ? UINT16_MAX @@ -666,7 +817,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_unhandled_case(guest_index_endian); return false; } - cacheable.host_index_endian = xenos::Endian::kNone; + cacheable.host_shader_index_endian = xenos::Endian::kNone; } } cache_transaction.SetNewResult(cacheable); @@ -677,7 +828,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { // endian-swap, or even to safely drop the upper 8 bits if no swap is even // needed) indirectly. cacheable.host_draw_vertex_count = guest_draw_vertex_count; - cacheable.index_buffer_type = ProcessedIndexBufferType::kGuest; + cacheable.index_buffer_type = ProcessedIndexBufferType::kGuestDMA; cacheable.host_primitive_reset_enabled = guest_primitive_reset_enabled; if (guest_primitive_reset_enabled) { if (guest_index_format == xenos::IndexFormat::kInt16) { @@ -742,8 +893,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { } else { // Low 24 bits of the guest index are compared to the primitive reset // index. 
If the backend doesn't support full 32-bit indices, for - // ProcessedIndexBufferType::kGuest, the host needs to read the buffer - // indirectly in the vertex shaders and swap, and for + // ProcessedIndexBufferType::kGuestDMA, the host needs to read the + // buffer indirectly in the vertex shaders and swap, and for // ProcessedIndexBufferType::kHostConverted (if primitive reset is // actually used, thus exactly 0xFFFFFFFF must be sent to the host for // it in a true index buffer), no indirection is done, but @@ -800,26 +951,31 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_unhandled_case(guest_index_endian); return false; } - cacheable.host_index_endian = full_32bit_vertex_indices_used_ - ? guest_index_endian - : xenos::Endian::kNone; + cacheable.host_shader_index_endian = + full_32bit_vertex_indices_used_ ? guest_index_endian + : xenos::Endian::kNone; } cache_transaction.SetNewResult(cacheable); } } } - if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuest) { - // Request the index buffer memory. - // TODO(Triang3l): Shared memory request cache. - if (!shared_memory_.RequestRange(guest_index_base, - guest_index_buffer_needed_bytes)) { - XELOGE( - "PrimitiveProcessor: Failed to request index buffer 0x{:08X}, " - "0x{:X} bytes needed, in the shared memory", - guest_index_base, guest_index_buffer_needed_bytes); - return false; - } - } + } + } + + // Request the indices in the shared memory if they need to be accessed from + // there on the GPU. + if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuestDMA || + cacheable.index_buffer_type == + ProcessedIndexBufferType::kHostBuiltinForDMA) { + // Request the index buffer memory. + // TODO(Triang3l): Shared memory request cache. 
+ if (!shared_memory_.RequestRange(guest_index_base, + guest_index_buffer_needed_bytes)) { + XELOGE( + "PrimitiveProcessor: Failed to request index buffer 0x{:08X}, 0x{:X} " + "bytes needed, in the shared memory", + guest_index_base, guest_index_buffer_needed_bytes); + return false; } } @@ -832,7 +988,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { result_out.index_buffer_type = cacheable.index_buffer_type; result_out.guest_index_base = guest_index_base; result_out.host_index_format = cacheable.host_index_format; - result_out.host_index_endian = cacheable.host_index_endian; + result_out.host_shader_index_endian = cacheable.host_shader_index_endian; result_out.host_primitive_reset_enabled = cacheable.host_primitive_reset_enabled; result_out.host_index_buffer_handle = cacheable.host_index_buffer_handle; diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h index cfbec0ae9..6a77a3d0f 100644 --- a/src/xenia/gpu/primitive_processor.h +++ b/src/xenia/gpu/primitive_processor.h @@ -10,6 +10,7 @@ #ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_ #define XENIA_GPU_PRIMITIVE_PROCESSOR_H_ +#include #include #include #include @@ -110,13 +111,16 @@ class PrimitiveProcessor { // For 32-bit, indirection is needed if the host only supports 24-bit // indices (even for non-endian-swapped, as the GPU should be ignoring the // upper 8 bits completely, rather than exhibiting undefined behavior. - kGuest, + kGuestDMA, // Converted and stored in the primitive converter for the current draw // command. For 32-bit indices, if the host doesn't support all 32 bits, // this kind of an index buffer will always be pre-masked and pre-swapped. kHostConverted, // Auto-indexed on the guest, but with an adapter index buffer on the host. - kHostBuiltin, + kHostBuiltinForAuto, + // Adapter index buffer on the host for indirect loading of indices via DMA + // (from the shared memory). 
+ kHostBuiltinForDMA, }; struct ProcessingResult { @@ -136,13 +140,14 @@ class PrimitiveProcessor { ProcessedIndexBufferType index_buffer_type; uint32_t guest_index_base; xenos::IndexFormat host_index_format; - xenos::Endian host_index_endian; + xenos::Endian host_shader_index_endian; // The reset index, if enabled, is always 0xFFFF for host_index_format // kInt16 and 0xFFFFFFFF for kInt32. Never enabled for "list" primitive // types, thus safe for direct usage on Vulkan. bool host_primitive_reset_enabled; // Backend-specific handle for the index buffer valid for the current draw, - // only valid for index_buffer_type kHostConverted and kHostBuiltin. + // only valid for index_buffer_type kHostConverted, kHostBuiltinForAuto and + // kHostBuiltinForDMA. size_t host_index_buffer_handle; bool IsTessellated() const { return Shader::IsHostVertexShaderTypeDomain(host_vertex_shader_type); @@ -165,6 +170,12 @@ class PrimitiveProcessor { bool IsConvertingQuadListsToTriangleLists() const { return convert_quad_lists_to_triangle_lists_; } + bool IsExpandingPointSpritesInVS() const { + return expand_point_sprites_in_vs_; + } + bool IsExpandingRectangleListsInVS() const { + return expand_rectangle_lists_in_vs_; + } // Submission must be open to call (may request the index buffer in the shared // memory). @@ -217,8 +228,8 @@ class PrimitiveProcessor { // if indirection may be needed. // - When full 32-bit indices are not supported, the host must be using // auto-indexed draws for 32-bit indices of ProcessedIndexBufferType - // kGuest, while fetching the index data manually from the shared memory - // buffer and endian-swapping it. + // kGuestDMA, while fetching the index data manually from the shared + // memory buffer and endian-swapping it. 
// - Indirection, however, precludes primitive reset usage - so if // primitive reset is needed, the primitive processor will pre-swap and // pre-mask the index buffer so there are only host-endian 0x00###### or @@ -235,19 +246,26 @@ class PrimitiveProcessor { // those guest primitive types directly or through geometry shader // emulation. Debug overriding will be resolved in the common code if // needed. + // - point_sprites_supported_without_vs_expansion, + // rectangle_lists_supported_without_vs_expansion: + // - Pass true or false depending on whether the host actually supports + // those guest primitive types directly or through geometry shader + // emulation. Overrides do not apply to these as hosts are not required to + // support the fallback paths since they require different vertex shader + // structure (for the fallback HostVertexShaderTypes). bool InitializeCommon(bool full_32bit_vertex_indices_supported, bool triangle_fans_supported, bool line_loops_supported, - bool quad_lists_supported); + bool quad_lists_supported, + bool point_sprites_supported_without_vs_expansion, + bool rectangle_lists_supported_without_vs_expansion); // If any primitive type conversion is needed for auto-indexed draws, called // from InitializeCommon (thus only once in the primitive processor's // lifetime) to set up the backend's index buffer containing indices for - // primitive type remapping. The backend must allocate a `sizeof(uint16_t) * - // index_count` buffer and call fill_callback for its mapping if creation is - // successful. 16-bit indices are enough even if the backend has primitive - // reset enabled all the time (Metal) as auto-indexed draws are limited to - // UINT16_MAX vertices, not UINT16_MAX + 1. - virtual bool InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, std::function fill_callback) = 0; + // primitive type remapping. 
The backend must allocate a 4-byte-aligned buffer + // with `size_bytes` and call fill_callback for its mapping if creation has + // been successful. + virtual bool InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) = 0; // Call last in implementation-specific shutdown, also callable from the // destructor. void ShutdownCommon(); @@ -509,6 +527,12 @@ class PrimitiveProcessor { } }; + static constexpr uint32_t GetTwoTriangleStripIndexCount( + uint32_t strip_count) { + // 4 vertices per strip, and primitive restarts between strips. + return 4 * strip_count + (std::max(strip_count, UINT32_C(1)) - 1); + } + // Triangle fan test cases: // - 4D5307E6 - main menu - game logo, developer logo, backgrounds of the menu // item list (the whole menu and individual items) - no index buffer. @@ -675,8 +699,11 @@ class PrimitiveProcessor { bool convert_triangle_fans_to_lists_ = false; bool convert_line_loops_to_strips_ = false; bool convert_quad_lists_to_triangle_lists_ = false; + bool expand_point_sprites_in_vs_ = false; + bool expand_rectangle_lists_in_vs_ = false; // Byte offsets used, for simplicity, directly as handles. 
+ size_t builtin_ib_offset_two_triangle_strips_ = SIZE_MAX; size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX; size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX; @@ -745,7 +772,7 @@ class PrimitiveProcessor { uint32_t host_draw_vertex_count; ProcessedIndexBufferType index_buffer_type; xenos::IndexFormat host_index_format; - xenos::Endian host_index_endian; + xenos::Endian host_shader_index_endian; bool host_primitive_reset_enabled; size_t host_index_buffer_handle; }; diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 199d0f99c..bb89e0d41 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -111,6 +111,7 @@ void SpirvShaderTranslator::Reset() { input_front_facing_ = spv::NoResult; std::fill(input_output_interpolators_.begin(), input_output_interpolators_.end(), spv::NoResult); + output_point_coordinates_ = spv::NoResult; output_point_size_ = spv::NoResult; sampler_bindings_.clear(); @@ -1097,18 +1098,33 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() { Modification shader_modification = GetSpirvShaderModification(); - // Create the point size output. Not using gl_PointSize from gl_PerVertex not - // to rely on the shaderTessellationAndGeometryPointSize feature, and also - // because the value written to gl_PointSize must be greater than zero. 
- if (shader_modification.vertex.output_point_size) { - output_point_size_ = - builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput, - type_float_, "xe_out_point_size"); - builder_->addDecoration(output_point_size_, spv::DecorationLocation, - int(output_location)); - builder_->addDecoration(output_point_size_, spv::DecorationInvariant); - main_interface_.push_back(output_point_size_); - ++output_location; + if (shader_modification.vertex.output_point_parameters) { + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Create the point coordinates output. + output_point_coordinates_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float2_, "xe_out_point_coordinates"); + builder_->addDecoration(output_point_coordinates_, + spv::DecorationLocation, int(output_location)); + builder_->addDecoration(output_point_coordinates_, + spv::DecorationInvariant); + main_interface_.push_back(output_point_coordinates_); + ++output_location; + } else { + // Create the point size output. Not using gl_PointSize from gl_PerVertex + // not to rely on the shaderTessellationAndGeometryPointSize feature, and + // also because the value written to gl_PointSize must be greater than + // zero. + output_point_size_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float_, "xe_out_point_size"); + builder_->addDecoration(output_point_size_, spv::DecorationLocation, + int(output_location)); + builder_->addDecoration(output_point_size_, spv::DecorationInvariant); + main_interface_.push_back(output_point_size_); + ++output_location; + } } // Create the gl_PerVertex output for used system outputs. 
@@ -1172,24 +1188,33 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { } } + Modification shader_modification = GetSpirvShaderModification(); + + // TODO(Triang3l): For HostVertexShaderType::kRectangleListAsTriangleStrip, + // start the vertex loop, and load the index there. + // Load the vertex index or the tessellation parameters. if (register_count()) { // TODO(Triang3l): Barycentric coordinates and patch index. if (IsSpirvVertexShader()) { - // TODO(Triang3l): Close line loop primitive. - // Load the unswapped index as uint for swapping, or for indirect loading - // if needed. spv::Id vertex_index = builder_->createUnaryOp( spv::OpBitcast, type_uint_, builder_->createLoad(input_vertex_index_, spv::NoPrecision)); - if (!features_.full_draw_index_uint32) { - // Check if the full 32-bit index needs to be loaded indirectly. + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Load the point index, autogenerated or indirectly from the index + // buffer. + // Extract the primitive index from the two-triangle strip vertex index. + spv::Id const_uint_2 = builder_->makeUintConstant(2); + vertex_index = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, vertex_index, const_uint_2); + // Check if the index needs to be loaded from the index buffer. 
spv::Id load_vertex_index = builder_->createBinOp( spv::OpINotEqual, type_bool_, builder_->createBinOp( spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant( - static_cast(kSysFlag_VertexIndexLoad))), + builder_->makeUintConstant(static_cast( + kSysFlag_ComputeOrPrimitiveVertexIndexLoad))), const_uint_0_); spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint(); spv::Block& block_load_vertex_index_start = builder_->makeNewBlock(); @@ -1200,25 +1225,61 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { &block_load_vertex_index_start, &block_load_vertex_index_merge); builder_->setBuildPoint(&block_load_vertex_index_start); - // Load the 32-bit index. - // TODO(Triang3l): Bounds checking. + // Check if the index is 32-bit. + spv::Id vertex_index_is_32bit = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(static_cast( + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit))), + const_uint_0_); + // Calculate the vertex index address in the shared memory. id_vector_temp_.clear(); id_vector_temp_.push_back( builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); + spv::Id vertex_index_address = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision), + builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, vertex_index, + builder_->createTriOp(spv::OpSelect, type_uint_, + vertex_index_is_32bit, const_uint_2, + builder_->makeUintConstant(1)))); + // Load the 32 bits containing the whole vertex index or two 16-bit + // vertex indices. + // TODO(Triang3l): Bounds checking. 
spv::Id loaded_vertex_index = LoadUint32FromSharedMemory(builder_->createUnaryOp( spv::OpBitcast, type_int_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + vertex_index_address, const_uint_2))); + // Extract the 16-bit index from the loaded 32 bits if needed. + loaded_vertex_index = builder_->createTriOp( + spv::OpSelect, type_uint_, vertex_index_is_32bit, + loaded_vertex_index, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, loaded_vertex_index, builder_->createBinOp( - spv::OpIAdd, type_uint_, - builder_->createBinOp( - spv::OpShiftRightLogical, type_uint_, - builder_->createLoad( - builder_->createAccessChain( - spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision), - builder_->makeUintConstant(2)), - vertex_index))); + spv::OpShiftLeftLogical, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + vertex_index_address, const_uint_2), + builder_->makeUintConstant(4 - 1)), + builder_->makeUintConstant(16))); + // Endian-swap the loaded index. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); + loaded_vertex_index = EndianSwap32Uint( + loaded_vertex_index, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision)); // Get the actual build point for phi. spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint(); builder_->createBranch(&block_load_vertex_index_merge); @@ -1238,19 +1299,81 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { builder_->getBuildPoint()->addInstruction( std::move(loaded_vertex_index_phi_op)); } + } else { + // TODO(Triang3l): Close line loop primitive. + // Load the unswapped index as uint for swapping, or for indirect + // loading if needed. + if (!features_.full_draw_index_uint32) { + // Check if the full 32-bit index needs to be loaded indirectly. 
+ spv::Id load_vertex_index = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant( + static_cast(kSysFlag_VertexIndexLoad))), + const_uint_0_); + spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint(); + spv::Block& block_load_vertex_index_start = builder_->makeNewBlock(); + spv::Block& block_load_vertex_index_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_load_vertex_index_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(load_vertex_index, + &block_load_vertex_index_start, + &block_load_vertex_index_merge); + builder_->setBuildPoint(&block_load_vertex_index_start); + // Load the 32-bit index. + // TODO(Triang3l): Bounds checking. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); + spv::Id loaded_vertex_index = + LoadUint32FromSharedMemory(builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + builder_->makeUintConstant(2)), + vertex_index))); + // Get the actual build point for phi. + spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint(); + builder_->createBranch(&block_load_vertex_index_merge); + // Select between the loaded index and the original index from Vulkan. 
+ builder_->setBuildPoint(&block_load_vertex_index_merge); + { + std::unique_ptr loaded_vertex_index_phi_op = + std::make_unique(builder_->getUniqueId(), + type_uint_, spv::OpPhi); + loaded_vertex_index_phi_op->addIdOperand(loaded_vertex_index); + loaded_vertex_index_phi_op->addIdOperand( + block_load_vertex_index_end.getId()); + loaded_vertex_index_phi_op->addIdOperand(vertex_index); + loaded_vertex_index_phi_op->addIdOperand( + block_load_vertex_index_pre.getId()); + vertex_index = loaded_vertex_index_phi_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(loaded_vertex_index_phi_op)); + } + } + // Endian-swap the index. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); + vertex_index = EndianSwap32Uint( + vertex_index, builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision)); } - // Endian-swap the index and convert to int. - id_vector_temp_.clear(); - id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); - spv::Id vertex_index_endian = - builder_->createLoad(builder_->createAccessChain( - spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision); - vertex_index = builder_->createUnaryOp( - spv::OpBitcast, type_int_, - EndianSwap32Uint(vertex_index, vertex_index_endian)); + // Convert the index to a signed integer. + vertex_index = + builder_->createUnaryOp(spv::OpBitcast, type_int_, vertex_index); // Add the base to the index. id_vector_temp_.clear(); id_vector_temp_.push_back( @@ -1301,61 +1424,66 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { builder_->createTriOp(spv::OpSelect, type_float_, is_w_not_reciprocal, position_w, guest_position_w_inv); - // Check if the shader returns XY/W rather than XY, and if it does, revert - // that. 
- // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. - uint_vector_temp_.clear(); - uint_vector_temp_.reserve(2); - uint_vector_temp_.push_back(0); - uint_vector_temp_.push_back(1); - spv::Id position_xy = builder_->createRvalueSwizzle( - spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_); - spv::Id is_xy_divided_by_w = builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant( - static_cast(kSysFlag_XYDividedByW))), - const_uint_0_); - spv::Id guest_position_xy_mul_w = builder_->createBinOp( - spv::OpVectorTimesScalar, type_float2_, position_xy, position_w); - builder_->addDecoration(guest_position_xy_mul_w, - spv::DecorationNoContraction); - position_xy = - builder_->createTriOp(spv::OpSelect, type_float2_, is_xy_divided_by_w, - guest_position_xy_mul_w, position_xy); - - // Check if the shader returns Z/W rather than Z, and if it does, revert that. - // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. - spv::Id position_z = - builder_->createCompositeExtract(guest_position, type_float_, 2); - spv::Id is_z_divided_by_w = builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant( - static_cast(kSysFlag_ZDividedByW))), - const_uint_0_); - spv::Id guest_position_z_mul_w = - builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w); - builder_->addDecoration(guest_position_z_mul_w, spv::DecorationNoContraction); - position_z = - builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w, - guest_position_z_mul_w, position_z); - - // Build XYZ of the position with W format handled. 
spv::Id position_xyz; + + // Open a scope since position_xy and position_z won't be synchronized anymore + // after position_xyz is built and modified later. { - std::unique_ptr composite_construct_op = - std::make_unique( - builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct); - composite_construct_op->addIdOperand(position_xy); - composite_construct_op->addIdOperand(position_z); - position_xyz = composite_construct_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(composite_construct_op)); + // Check if the shader returns XY/W rather than XY, and if it does, revert + // that. + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(2); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + spv::Id position_xy = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_); + spv::Id is_xy_divided_by_w = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant( + static_cast(kSysFlag_XYDividedByW))), + const_uint_0_); + spv::Id guest_position_xy_mul_w = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float2_, position_xy, position_w); + builder_->addDecoration(guest_position_xy_mul_w, + spv::DecorationNoContraction); + position_xy = builder_->createTriOp( + spv::OpSelect, type_float2_, + builder_->smearScalar(spv::NoPrecision, is_xy_divided_by_w, + type_bool2_), + guest_position_xy_mul_w, position_xy); + + // Check if the shader returns Z/W rather than Z, and if it does, revert + // that. 
+ spv::Id position_z = + builder_->createCompositeExtract(guest_position, type_float_, 2); + spv::Id is_z_divided_by_w = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant( + static_cast(kSysFlag_ZDividedByW))), + const_uint_0_); + spv::Id guest_position_z_mul_w = + builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w); + builder_->addDecoration(guest_position_z_mul_w, + spv::DecorationNoContraction); + position_z = + builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w, + guest_position_z_mul_w, position_z); + + // Build XYZ of the position with W format handled. + { + std::unique_ptr composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct); + composite_construct_op->addIdOperand(position_xy); + composite_construct_op->addIdOperand(position_z); + position_xyz = composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(composite_construct_op)); + } } // Apply the NDC scale and offset for guest to host viewport transformation. @@ -1382,20 +1510,6 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { ndc_offset_mul_w); builder_->addDecoration(position_xyz, spv::DecorationNoContraction); - // Store the position converted to the host. - spv::Id position; - { - std::unique_ptr composite_construct_op = - std::make_unique( - builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); - composite_construct_op->addIdOperand(position_xyz); - composite_construct_op->addIdOperand(position_w); - position = composite_construct_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(composite_construct_op)); - } - builder_->createStore(position, position_ptr); - // Write the point size. 
if (output_point_size_ != spv::NoResult) { spv::Id point_size; @@ -1415,6 +1529,154 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { } builder_->createStore(point_size, output_point_size_); } + + Modification shader_modification = GetSpirvShaderModification(); + + // Expand the point sprite. + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Top-left, bottom-left, top-right, bottom-right order (chosen arbitrarily, + // simply based on counterclockwise meaning front with + // frontFace = VkFrontFace(0), but faceness is ignored for non-polygon + // primitive types). + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(builder_->makeUintConstant(0b10)); + id_vector_temp_.push_back(builder_->makeUintConstant(0b01)); + spv::Id point_vertex_positive = builder_->createBinOp( + spv::OpINotEqual, type_bool2_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint2_, + builder_->smearScalar(spv::NoPrecision, + builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createLoad(input_vertex_index_, + spv::NoPrecision)), + type_uint2_), + builder_->createCompositeConstruct(type_uint2_, id_vector_temp_)), + SpirvSmearScalarResultOrConstant(const_uint_0_, type_uint2_)); + + // Load the point diameter in guest pixels, with the override from the + // vertex shader if provided. 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantPointConstantDiameter)); + spv::Id point_guest_diameter = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b001) { + assert_true(var_main_point_size_edge_flag_kill_vertex_ != spv::NoResult); + id_vector_temp_.clear(); + id_vector_temp_.push_back(const_int_0_); + spv::Id point_vertex_diameter = builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassFunction, + var_main_point_size_edge_flag_kill_vertex_, id_vector_temp_), + spv::NoPrecision); + // The vertex shader's header writes -1.0 to point_size by default, so any + // non-negative value means that it was overwritten by the translated + // vertex shader, and needs to be used instead of the constant size. The + // per-vertex diameter has already been clamped earlier in translation + // (combined with making it non-negative). + point_guest_diameter = builder_->createTriOp( + spv::OpSelect, type_float2_, + builder_->smearScalar( + spv::NoPrecision, + builder_->createBinOp(spv::OpFOrdGreaterThanEqual, type_bool_, + point_vertex_diameter, const_float_0_), + type_bool2_), + builder_->smearScalar(spv::NoPrecision, point_vertex_diameter, + type_float2_), + point_guest_diameter); + } + // Transform the diameter in the guest screen coordinates to radius in the + // normalized device coordinates. 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant( + kSystemConstantPointScreenDiameterToNdcRadius)); + spv::Id point_radius = builder_->createBinOp( + spv::OpFMul, type_float2_, point_guest_diameter, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision)); + builder_->addDecoration(point_radius, spv::DecorationNoContraction); + // Transform the radius from the normalized device coordinates to the clip + // space. + point_radius = builder_->createBinOp(spv::OpVectorTimesScalar, type_float2_, + point_radius, position_w); + builder_->addDecoration(point_radius, spv::DecorationNoContraction); + + // Apply the direction of expansion for the current host vertex. + spv::Id point_radius_negative = + builder_->createUnaryOp(spv::OpFNegate, type_float2_, point_radius); + builder_->addDecoration(point_radius_negative, + spv::DecorationNoContraction); + // Expand the point sprite. + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(2); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + spv::Id point_position_xy = builder_->createBinOp( + spv::OpFAdd, type_float2_, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + position_xyz, uint_vector_temp_), + builder_->createTriOp(spv::OpSelect, type_float2_, + point_vertex_positive, point_radius, + point_radius_negative)); + builder_->addDecoration(point_position_xy, spv::DecorationNoContraction); + // Store the position. + spv::Id position; + { + // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()` + // assertion in createCompositeConstruct, OpCompositeConstruct can + // construct vectors not only from scalars, but also from other vectors. 
+ std::unique_ptr composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); + composite_construct_op->addIdOperand(point_position_xy); + composite_construct_op->addIdOperand( + builder_->createCompositeExtract(position_xyz, type_float_, 2)); + composite_construct_op->addIdOperand(position_w); + position = composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(composite_construct_op)); + } + builder_->createStore(position, position_ptr); + + // Write the point coordinates. + if (output_point_coordinates_ != spv::NoResult) { + builder_->createStore( + builder_->createTriOp(spv::OpSelect, type_float2_, + point_vertex_positive, const_float2_1_, + const_float2_0_), + output_point_coordinates_); + } + + // TODO(Triang3l): For points, handle ps_ucp_mode (take the guest clip space + // coordinates instead of the host ones, calculate the distances to the user + // clip planes, cull using the distance from the center for modes 0, 1 and + // 2, cull and clip per-vertex for modes 2 and 3) in clip and cull + // distances. + } else { + // Store the position converted to the host. + spv::Id position; + { + // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()` + // assertion in createCompositeConstruct, OpCompositeConstruct can + // construct vectors not only from scalars, but also from other vectors. 
+ std::unique_ptr composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); + composite_construct_op->addIdOperand(position_xyz); + composite_construct_op->addIdOperand(position_w); + position = composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(composite_construct_op)); + } + builder_->createStore(position, position_ptr); + } } void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 733bbf2ff..3bcd342a3 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -50,7 +50,11 @@ class SpirvShaderTranslator : public ShaderTranslator { // Interpolators written by the vertex shader and needed by the pixel // shader. uint32_t interpolator_mask : xenos::kMaxInterpolators; - uint32_t output_point_size : 1; + // For HostVertexShaderType kPointListAsTriangleStrip, whether to output + // the point coordinates. + // For other HostVertexShaderTypes (though truly reachable only for + // kVertex), whether to output the point size. + uint32_t output_point_parameters : 1; // Dynamically indexable register count from SQ_PROGRAM_CNTL. uint32_t dynamic_addressable_register_count : 8; // Pipeline stage and input configuration. @@ -655,6 +659,9 @@ class SpirvShaderTranslator : public ShaderTranslator { // all). std::array input_output_interpolators_; + // VS, only for HostVertexShaderType::kPointListAsTriangleStrip when needed + // for the PS - float2. + spv::Id output_point_coordinates_; // VS, only when needed - float. 
spv::Id output_point_size_; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 80affe639..68a00cbe8 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2171,7 +2171,9 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // TODO(Triang3l): Tessellation, geometry-type-specific vertex shader, // vertex shader as compute. if (primitive_processing_result.host_vertex_shader_type != - Shader::HostVertexShaderType::kVertex) { + Shader::HostVertexShaderType::kVertex && + primitive_processing_result.host_vertex_shader_type != + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { return false; } @@ -2179,7 +2181,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, vertex_shader_modification = pipeline_cache_->GetCurrentVertexShaderModification( *vertex_shader, primitive_processing_result.host_vertex_shader_type, - interpolator_mask); + interpolator_mask, ps_param_gen_pos != UINT32_MAX); pixel_shader_modification = pixel_shader ? pipeline_cache_->GetCurrentPixelShaderModification( *pixel_shader, interpolator_mask, ps_param_gen_pos) @@ -2348,6 +2350,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, } const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); const VkPhysicalDeviceLimits& device_limits = provider.device_properties().limits; @@ -2382,11 +2385,23 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, UpdateDynamicState(viewport_info, primitive_polygonal, normalized_depth_control); + auto vgt_draw_initiator = regs.Get(); + + // Whether to load the guest 32-bit (usually big-endian) vertex index + // indirectly in the vertex shader if full 32-bit indices are not supported by + // the host. 
+ bool shader_32bit_index_dma = + !device_features.fullDrawIndexUint32 && + primitive_processing_result.index_buffer_type == + PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA && + vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32 && + primitive_processing_result.host_vertex_shader_type == + Shader::HostVertexShaderType::kVertex; + // Update system constants before uploading them. - bool vertex_shader_index_load; UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result, - viewport_info, used_texture_mask, - vertex_shader_index_load); + shader_32bit_index_dma, viewport_info, + used_texture_mask); // Update uniform buffers and descriptor sets after binding the pipeline with // the new layout. @@ -2453,13 +2468,13 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Draw. if (primitive_processing_result.index_buffer_type == PrimitiveProcessor::ProcessedIndexBufferType::kNone || - vertex_shader_index_load) { + shader_32bit_index_dma) { deferred_command_buffer_.CmdVkDraw( primitive_processing_result.host_draw_vertex_count, 1, 0, 0); } else { std::pair index_buffer; switch (primitive_processing_result.index_buffer_type) { - case PrimitiveProcessor::ProcessedIndexBufferType::kGuest: + case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA: index_buffer.first = shared_memory_->buffer(); index_buffer.second = primitive_processing_result.guest_index_base; break; @@ -2467,7 +2482,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, index_buffer = primitive_processor_->GetConvertedIndexBuffer( primitive_processing_result.host_index_buffer_handle); break; - case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA: index_buffer = primitive_processor_->GetBuiltinIndexBuffer( primitive_processing_result.host_index_buffer_handle); break; @@ 
-3342,8 +3358,8 @@ void VulkanCommandProcessor::UpdateDynamicState( void VulkanCommandProcessor::UpdateSystemConstantValues( bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, - const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, - bool& vertex_shader_index_load_out) { + bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, + uint32_t used_texture_mask) { #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES @@ -3367,51 +3383,17 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( // Flags. uint32_t flags = 0; // Vertex index shader loading. - bool vertex_shader_index_load = false; - // Only for ProcessedIndexBufferType kGuest since kHostConverted indices may - // be not loaded into the GPU memory (only read on the CPU), though - // kHostConverted must never be used for point lists and rectangle lists - // without geometry shaders anyway. For regular 32-bit index fetching without - // fullDrawIndexUint32, kHostConverted indices are already byte-swapped and - // truncated to 24 bits, so indirect fetch is not needed. + if (shader_32bit_index_dma) { + flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad; + } if (primitive_processing_result.index_buffer_type == - PrimitiveProcessor::ProcessedIndexBufferType::kGuest) { - switch (primitive_processing_result.host_vertex_shader_type) { - case Shader::HostVertexShaderType::kVertex: { - // For guest (usually big-endian) 32-bit indices when they're not - // supported by the device. 
- if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { - const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); - const VkPhysicalDeviceFeatures& device_features = - provider.device_features(); - if (!device_features.fullDrawIndexUint32) { - vertex_shader_index_load = true; - flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad; - } - } - } break; - // kMemexportCompute never comes out of the PrimitiveProcessor, as - // memexport compute shaders are executed alongside their vertex - // counterparts, since they may still result in drawing. - case Shader::HostVertexShaderType::kPointListAsTriangleStrip: - case Shader::HostVertexShaderType::kRectangleListAsTriangleStrip: { - // Always loading the guest index buffer indirectly if it's used, as - // host indexing contains a part needed specifically for the host for - // the construction of the primitive - host vertices don't map 1:1 to - // guest ones. - vertex_shader_index_load = true; - flags |= - SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad; - if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { - flags |= SpirvShaderTranslator :: - kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit; - } - } break; - default: - break; + PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA) { + flags |= SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad; + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { + flags |= SpirvShaderTranslator :: + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit; } } - vertex_shader_index_load_out = vertex_shader_index_load; // W0 division control. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf // 8: VTX_XY_FMT = true: the incoming XY have already been multiplied by 1/W0. @@ -3466,9 +3448,9 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( // Index or tessellation edge factor buffer endianness. 
dirty |= system_constants_.vertex_index_endian != - primitive_processing_result.host_index_endian; + primitive_processing_result.host_shader_index_endian; system_constants_.vertex_index_endian = - primitive_processing_result.host_index_endian; + primitive_processing_result.host_shader_index_endian; // Vertex index offset. dirty |= system_constants_.vertex_base_index != vgt_indx_offset; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index f500e0718..7920981fb 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -436,8 +436,8 @@ class VulkanCommandProcessor : public CommandProcessor { void UpdateSystemConstantValues( bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, - const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, - bool& vertex_shader_index_load_out); + bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, + uint32_t used_texture_mask); bool UpdateBindings(const VulkanShader* vertex_shader, const VulkanShader* pixel_shader); // Allocates a descriptor set and fills one or two VkWriteDescriptorSet diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index 7cf30e250..aff800c1a 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -118,7 +118,7 @@ VulkanShader* VulkanPipelineCache::LoadShader(xenos::ShaderType shader_type, SpirvShaderTranslator::Modification VulkanPipelineCache::GetCurrentVertexShaderModification( const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type, - uint32_t interpolator_mask) const { + uint32_t interpolator_mask, bool ps_param_gen_used) const { assert_true(shader.type() == xenos::ShaderType::kVertex); assert_true(shader.is_ucode_analyzed()); const auto& regs = register_file_; @@ -133,10 +133,15 
@@ VulkanPipelineCache::GetCurrentVertexShaderModification( modification.vertex.interpolator_mask = interpolator_mask; - modification.vertex.output_point_size = - uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) && - regs.Get().prim_type == - xenos::PrimitiveType::kPointList); + if (host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + modification.vertex.output_point_parameters = uint32_t(ps_param_gen_used); + } else { + modification.vertex.output_point_parameters = + uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) && + regs.Get().prim_type == + xenos::PrimitiveType::kPointList); + } return modification; } @@ -828,6 +833,17 @@ bool VulkanPipelineCache::GetGeometryShaderKey( if (geometry_shader_type == PipelineGeometryShader::kNone) { return false; } + // For kPointListAsTriangleStrip, output_point_parameters has a different + // meaning (the coordinates, not the size). However, the AsTriangleStrip host + // vertex shader types are needed specifically when geometry shaders are not + // supported as fallbacks. 
+ if (vertex_shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip || + vertex_shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) { + assert_always(); + return false; + } GeometryShaderKey key; key.type = geometry_shader_type; // TODO(Triang3l): Once all needed inputs and outputs are added, uncomment the @@ -840,7 +856,8 @@ bool VulkanPipelineCache::GetGeometryShaderKey( /* vertex_shader_modification.vertex.user_clip_plane_cull */ 0; key.has_vertex_kill_and = /* vertex_shader_modification.vertex.vertex_kill_and */ 0; - key.has_point_size = vertex_shader_modification.vertex.output_point_size; + key.has_point_size = + vertex_shader_modification.vertex.output_point_parameters; key.has_point_coordinates = pixel_shader_modification.pixel.param_gen_point; key_out = key; return true; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h index 6e0c73ab0..56346d1bc 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h @@ -71,7 +71,7 @@ class VulkanPipelineCache { SpirvShaderTranslator::Modification GetCurrentVertexShaderModification( const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type, - uint32_t interpolator_mask) const; + uint32_t interpolator_mask, bool ps_param_gen_used) const; SpirvShaderTranslator::Modification GetCurrentPixelShaderModification( const Shader& shader, uint32_t interpolator_mask, uint32_t param_gen_pos) const; diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc index b7f37f4b9..86b13b4ae 100644 --- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc @@ -36,7 +36,9 @@ bool VulkanPrimitiveProcessor::Initialize() { if (!InitializeCommon(device_features.fullDrawIndexUint32, 
!device_portability_subset_features || device_portability_subset_features->triangleFans, - false, device_features.geometryShader)) { + false, device_features.geometryShader, + device_features.geometryShader, + device_features.geometryShader)) { Shutdown(); return false; } @@ -127,9 +129,9 @@ void VulkanPrimitiveProcessor::EndFrame() { frame_index_buffers_.clear(); } -bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, std::function fill_callback) { - assert_not_zero(index_count); +bool VulkanPrimitiveProcessor::InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) { + assert_not_zero(size_bytes); assert_true(builtin_index_buffer_ == VK_NULL_HANDLE); assert_true(builtin_index_buffer_memory_ == VK_NULL_HANDLE); assert_true(builtin_index_buffer_upload_ == VK_NULL_HANDLE); @@ -140,7 +142,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); - builtin_index_buffer_size_ = VkDeviceSize(sizeof(uint16_t) * index_count); + builtin_index_buffer_size_ = VkDeviceSize(size_bytes); if (!ui::vulkan::util::CreateDedicatedAllocationBuffer( provider, builtin_index_buffer_size_, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, @@ -148,8 +150,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( builtin_index_buffer_memory_)) { XELOGE( "Vulkan primitive processor: Failed to create the built-in index " - "buffer GPU resource with {} 16-bit indices", - index_count); + "buffer GPU resource with {} bytes", + size_bytes); return false; } uint32_t upload_memory_type; @@ -161,8 +163,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( &upload_memory_type)) { XELOGE( "Vulkan primitive processor: Failed to create the built-in index " - "buffer upload resource with {} 16-bit indices", - index_count); + "buffer upload resource with {} bytes", + 
size_bytes); ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device, builtin_index_buffer_); ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device, @@ -175,8 +177,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( VK_WHOLE_SIZE, 0, &mapping) != VK_SUCCESS) { XELOGE( "Vulkan primitive processor: Failed to map the built-in index buffer " - "upload resource with {} 16-bit indices", - index_count); + "upload resource with {} bytes", + size_bytes); ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device, builtin_index_buffer_upload_); ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device, @@ -187,7 +189,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( builtin_index_buffer_memory_); return false; } - fill_callback(reinterpret_cast(mapping)); + fill_callback(mapping); ui::vulkan::util::FlushMappedMemoryRange( provider, builtin_index_buffer_memory_, upload_memory_type); dfn.vkUnmapMemory(device, builtin_index_buffer_upload_memory_); diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.h b/src/xenia/gpu/vulkan/vulkan_primitive_processor.h index 50e729577..ea8ed4fed 100644 --- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.h @@ -56,9 +56,8 @@ class VulkanPrimitiveProcessor final : public PrimitiveProcessor { } protected: - bool InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, - std::function fill_callback) override; + bool InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) override; void* RequestHostConvertedIndexBufferForCurrentFrame( xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,