diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc
index 69d1c04ac..de9c6c969 100644
--- a/src/xenia/gpu/spirv_shader_translator.cc
+++ b/src/xenia/gpu/spirv_shader_translator.cc
@@ -168,10 +168,13 @@ void SpirvShaderTranslator::StartTranslation() {
     spv::Id type;
   };
   const SystemConstant system_constants[] = {
+      {"flags", offsetof(SystemConstants, flags), type_uint_},
       {"vertex_index_endian", offsetof(SystemConstants, vertex_index_endian),
        type_uint_},
       {"vertex_base_index", offsetof(SystemConstants, vertex_base_index),
        type_int_},
+      {"ndc_scale", offsetof(SystemConstants, ndc_scale), type_float3_},
+      {"ndc_offset", offsetof(SystemConstants, ndc_offset), type_float3_},
   };
   id_vector_temp_.clear();
   id_vector_temp_.reserve(xe::countof(system_constants));
@@ -997,6 +1000,133 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
 }
 
 void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(builder_->makeIntConstant(kSystemConstantFlags));
+  spv::Id system_constant_flags = builder_->createLoad(
+      builder_->createAccessChain(spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+      spv::NoPrecision);
+
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(
+      builder_->makeIntConstant(kOutputPerVertexMemberPosition));
+  spv::Id position_ptr = builder_->createAccessChain(
+      spv::StorageClassOutput, output_per_vertex_, id_vector_temp_);
+  spv::Id guest_position = builder_->createLoad(position_ptr, spv::NoPrecision);
+
+  // Check if the shader already returns W, not 1/W, and if it doesn't, turn
+  // 1/W into W.
+  spv::Id position_w =
+      builder_->createCompositeExtract(guest_position, type_float_, 3);
+  spv::Id is_w_not_reciprocal = builder_->createBinOp(
+      spv::OpINotEqual, type_bool_,
+      builder_->createBinOp(
+          spv::OpBitwiseAnd, type_uint_, system_constant_flags,
+          builder_->makeUintConstant(
+              static_cast<uint32_t>(kSysFlag_WNotReciprocal))),
+      const_uint_0_);
+  spv::Id guest_position_w_inv = builder_->createBinOp(
+      spv::OpFDiv, type_float_, const_float_1_, position_w);
+  builder_->addDecoration(guest_position_w_inv, spv::DecorationNoContraction);
+  position_w =
+      builder_->createTriOp(spv::OpSelect, type_float_, is_w_not_reciprocal,
+                            position_w, guest_position_w_inv);
+
+  // Check if the shader returns XY/W rather than XY, and if it does, revert
+  // that.
+  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
+  // affine interpolation.
+  uint_vector_temp_.clear();
+  uint_vector_temp_.reserve(2);
+  uint_vector_temp_.push_back(0);
+  uint_vector_temp_.push_back(1);
+  spv::Id position_xy = builder_->createRvalueSwizzle(
+      spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_);
+  spv::Id is_xy_divided_by_w = builder_->createBinOp(
+      spv::OpINotEqual, type_bool_,
+      builder_->createBinOp(
+          spv::OpBitwiseAnd, type_uint_, system_constant_flags,
+          builder_->makeUintConstant(
+              static_cast<uint32_t>(kSysFlag_XYDividedByW))),
+      const_uint_0_);
+  spv::Id guest_position_xy_mul_w = builder_->createBinOp(
+      spv::OpVectorTimesScalar, type_float2_, position_xy, position_w);
+  builder_->addDecoration(guest_position_xy_mul_w,
+                          spv::DecorationNoContraction);
+  position_xy =
+      builder_->createTriOp(spv::OpSelect, type_float2_, is_xy_divided_by_w,
+                            guest_position_xy_mul_w, position_xy);
+
+  // Check if the shader returns Z/W rather than Z, and if it does, revert
+  // that.
+  // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
+  // affine interpolation.
+  spv::Id position_z =
+      builder_->createCompositeExtract(guest_position, type_float_, 2);
+  spv::Id is_z_divided_by_w = builder_->createBinOp(
+      spv::OpINotEqual, type_bool_,
+      builder_->createBinOp(
+          spv::OpBitwiseAnd, type_uint_, system_constant_flags,
+          builder_->makeUintConstant(
+              static_cast<uint32_t>(kSysFlag_ZDividedByW))),
+      const_uint_0_);
+  spv::Id guest_position_z_mul_w =
+      builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w);
+  builder_->addDecoration(guest_position_z_mul_w, spv::DecorationNoContraction);
+  position_z =
+      builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w,
+                            guest_position_z_mul_w, position_z);
+
+  // Build XYZ of the position with W format handled.
+  spv::Id position_xyz;
+  {
+    std::unique_ptr<spv::Instruction> composite_construct_op =
+        std::make_unique<spv::Instruction>(
+            builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct);
+    composite_construct_op->addIdOperand(position_xy);
+    composite_construct_op->addIdOperand(position_z);
+    position_xyz = composite_construct_op->getResultId();
+    builder_->getBuildPoint()->addInstruction(
+        std::move(composite_construct_op));
+  }
+
+  // Apply the NDC scale and offset for guest to host viewport transformation.
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(builder_->makeIntConstant(kSystemConstantNdcScale));
+  spv::Id ndc_scale = builder_->createLoad(
+      builder_->createAccessChain(spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+      spv::NoPrecision);
+  position_xyz =
+      builder_->createBinOp(spv::OpFMul, type_float3_, position_xyz, ndc_scale);
+  builder_->addDecoration(position_xyz, spv::DecorationNoContraction);
+  id_vector_temp_.clear();
+  id_vector_temp_.push_back(
+      builder_->makeIntConstant(kSystemConstantNdcOffset));
+  spv::Id ndc_offset = builder_->createLoad(
+      builder_->createAccessChain(spv::StorageClassUniform,
+                                  uniform_system_constants_, id_vector_temp_),
+      spv::NoPrecision);
+  spv::Id ndc_offset_mul_w = builder_->createBinOp(
+      spv::OpVectorTimesScalar, type_float3_, ndc_offset, position_w);
+  builder_->addDecoration(ndc_offset_mul_w, spv::DecorationNoContraction);
+  position_xyz = builder_->createBinOp(spv::OpFAdd, type_float3_, position_xyz,
+                                       ndc_offset_mul_w);
+  builder_->addDecoration(position_xyz, spv::DecorationNoContraction);
+
+  // Store the position converted to the host.
+  spv::Id position;
+  {
+    std::unique_ptr<spv::Instruction> composite_construct_op =
+        std::make_unique<spv::Instruction>(
+            builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
+    composite_construct_op->addIdOperand(position_xyz);
+    composite_construct_op->addIdOperand(position_w);
+    position = composite_construct_op->getResultId();
+    builder_->getBuildPoint()->addInstruction(
+        std::move(composite_construct_op));
+  }
+  builder_->createStore(position, position_ptr);
+
   // Write 1 to point size (using a geometry shader or another kind of fallback
   // to expand point sprites - point size support is not guaranteed, and the
   // size would also be limited, and can't be controlled independently along two
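Editorial note, not part of the patch: the SPIR-V emitted above is easier to follow as a scalar model of the guest-to-host position conversion. The sketch below is illustrative only; the float3/float4 helper types are local assumptions, and the kSysFlag_* masks mirror the flags added in spirv_shader_translator.h in this change.

// Illustrative sketch - a CPU-side model of what the translated vertex shader
// now does to the guest position before writing gl_Position. Not Xenia code.
#include <cstdint>

struct float3 { float x, y, z; };
struct float4 { float x, y, z, w; };

constexpr uint32_t kSysFlag_XYDividedByW = 1u << 0;
constexpr uint32_t kSysFlag_ZDividedByW = 1u << 1;
constexpr uint32_t kSysFlag_WNotReciprocal = 1u << 2;

float4 GuestPositionToHost(float4 guest, uint32_t flags, float3 ndc_scale,
                           float3 ndc_offset) {
  // The guest shader may export either W or 1/W - normalize to W.
  float w = (flags & kSysFlag_WNotReciprocal) ? guest.w : 1.0f / guest.w;
  // XY and Z may already be divided by W - undo that so the host's
  // fixed-function perspective divide produces the same result.
  float x = (flags & kSysFlag_XYDividedByW) ? guest.x * w : guest.x;
  float y = (flags & kSysFlag_XYDividedByW) ? guest.y * w : guest.y;
  float z = (flags & kSysFlag_ZDividedByW) ? guest.z * w : guest.z;
  // Remap the guest clip space into the host viewport; the offset is
  // multiplied by W because it is applied before the perspective divide.
  return float4{x * ndc_scale.x + ndc_offset.x * w,
                y * ndc_scale.y + ndc_offset.y * w,
                z * ndc_scale.z + ndc_offset.z * w, w};
}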
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h
index d4c32dda2..fadcf2a6b 100644
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@@ -25,12 +25,33 @@ namespace gpu {
 
 class SpirvShaderTranslator : public ShaderTranslator {
  public:
+  enum : uint32_t {
+    kSysFlag_XYDividedByW_Shift,
+    kSysFlag_ZDividedByW_Shift,
+    kSysFlag_WNotReciprocal_Shift,
+
+    kSysFlag_Count,
+
+    kSysFlag_XYDividedByW = 1u << kSysFlag_XYDividedByW_Shift,
+    kSysFlag_ZDividedByW = 1u << kSysFlag_ZDividedByW_Shift,
+    kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift,
+  };
+  static_assert(kSysFlag_Count <= 32, "Too many flags in the system constants");
+
   // IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED:
   // - SystemConstantIndex enum.
   // - Structure members in BeginTranslation.
   struct SystemConstants {
+    uint32_t flags;
     xenos::Endian vertex_index_endian;
     int32_t vertex_base_index;
+    uint32_t padding_vertex_base_index;
+
+    float ndc_scale[3];
+    uint32_t padding_ndc_scale;
+
+    float ndc_offset[3];
+    uint32_t padding_ndc_offset;
   };
 
   // The minimum limit for maxPerStageDescriptorStorageBuffers is 4, and for
@@ -329,8 +350,11 @@
   spv::Id const_float2_0_1_;
 
   enum SystemConstantIndex : unsigned int {
+    kSystemConstantFlags,
     kSystemConstantIndexVertexIndexEndian,
     kSystemConstantIndexVertexBaseIndex,
+    kSystemConstantNdcScale,
+    kSystemConstantNdcOffset,
   };
   spv::Id uniform_system_constants_;
   spv::Id uniform_float_constants_;
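Editorial note, not part of the patch: the explicit padding members appear to keep the C++ mirror of the uniform block in step with the layout the translator declares - a float3 member in a uniform block needs 16-byte alignment, so padding after vertex_base_index and after each vector keeps ndc_scale and ndc_offset on 16-byte boundaries. A hypothetical compile-time check of that assumption could look like this:

// Hypothetical sanity checks (not in the patch) for the host-side mirror of
// the system constants uniform block declared above.
#include <cstddef>

#include "xenia/gpu/spirv_shader_translator.h"

using SystemConstants = xe::gpu::SpirvShaderTranslator::SystemConstants;
static_assert(offsetof(SystemConstants, ndc_scale) % 16 == 0,
              "ndc_scale must be 16-byte aligned within the uniform block");
static_assert(offsetof(SystemConstants, ndc_offset) % 16 == 0,
              "ndc_offset must be 16-byte aligned within the uniform block");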
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 6b5a51006..9b4d598f1 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -686,14 +686,45 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
     current_graphics_pipeline_layout_ = pipeline_layout;
   }
 
+  const RegisterFile& regs = *register_file_;
+  const ui::vulkan::VulkanProvider& provider =
+      GetVulkanContext().GetVulkanProvider();
+  const VkPhysicalDeviceProperties& device_properties =
+      provider.device_properties();
+
+  // Get dynamic rasterizer state.
+  draw_util::ViewportInfo viewport_info;
+  // Just handling maxViewportDimensions is enough - viewportBoundsRange[1]
+  // must be at least 2 * max(maxViewportDimensions[0...1]) - 1, and
+  // maxViewportDimensions must be greater than or equal to the size of the
+  // largest possible framebuffer attachment (if the viewport has a positive
+  // offset and is between maxViewportDimensions and viewportBoundsRange[1],
+  // GetHostViewportInfo will adjust ndc_scale/ndc_offset to clamp it, and
+  // the clamped range will be outside the largest possible framebuffer anyway).
+  // TODO(Triang3l): Possibly handle maxViewportDimensions and
+  // viewportBoundsRange separately because when using fragment shader
+  // interlocks, framebuffers are not used, while the range may be wider than
+  // the dimensions? Though a viewport bigger than 4096 - the smallest possible
+  // maximum dimension (which is below the 8192 texture size limit on the Xbox
+  // 360) - and with an offset, is probably a situation that never happens in
+  // real life. Or even disregard the viewport bounds range in the fragment
+  // shader interlocks case completely - apply the viewport and the scissor
+  // offset directly to the pixel address and to things like ps_param_gen.
+  draw_util::GetHostViewportInfo(
+      regs, 1.0f, 1.0f, false,
+      float(device_properties.limits.maxViewportDimensions[0]),
+      float(device_properties.limits.maxViewportDimensions[1]), true,
+      viewport_info);
+
   // Update fixed-function dynamic state.
-  UpdateFixedFunctionState();
+  UpdateFixedFunctionState(viewport_info);
 
   bool indexed = index_buffer_info != nullptr && index_buffer_info->guest_base;
 
   // Update system constants before uploading them.
-  UpdateSystemConstantValues(indexed ? index_buffer_info->endianness
-                                     : xenos::Endian::kNone);
+  UpdateSystemConstantValues(
+      indexed ? index_buffer_info->endianness : xenos::Endian::kNone,
+      viewport_info);
 
   // Update uniform buffers and descriptor sets after binding the pipeline with
   // the new layout.
@@ -701,8 +732,6 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
     return false;
   }
 
-  const RegisterFile& regs = *register_file_;
-
   // Ensure vertex buffers are resident.
   // TODO(Triang3l): Cache residency for ranges in a way similar to how texture
   // validity is tracked.
@@ -1229,7 +1258,8 @@ VkShaderStageFlags VulkanCommandProcessor::GetGuestVertexShaderStageFlags()
   return stages;
 }
 
-void VulkanCommandProcessor::UpdateFixedFunctionState() {
+void VulkanCommandProcessor::UpdateFixedFunctionState(
+    const draw_util::ViewportInfo& viewport_info) {
 #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
   SCOPE_profile_cpu_f("gpu");
 #endif  // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
@@ -1245,53 +1275,13 @@
   uint32_t pixel_size_x = 1, pixel_size_y = 1;
 
   // Viewport.
-  // PA_CL_VTE_CNTL contains whether offsets and scales are enabled.
-  // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
-  // In games, either all are enabled (for regular drawing) or none are (for
-  // rectangle lists usually).
-  //
-  // If scale/offset is enabled, the Xenos shader is writing (neglecting W
-  // division) position in the NDC (-1, -1, dx_clip_space_def - 1) -> (1, 1, 1)
-  // box. If it's not, the position is in screen space. Since we can only use
-  // the NDC in PC APIs, we use a viewport of the largest possible size, and
-  // divide the position by it in translated shaders.
-  //
-  // TODO(Triang3l): Move all of this to draw_util.
-  // TODO(Triang3l): Limit the viewport if exceeding the device limit; move to
-  // NDC scale/offset constants.
-  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
-  float viewport_scale_x =
-      pa_cl_vte_cntl.vport_x_scale_ena
-          ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32)
-          : 4096.0f;
-  float viewport_scale_y =
-      pa_cl_vte_cntl.vport_y_scale_ena
-          ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
-          : 4096.0f;
-  float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena
-                               ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32
-                               : 1.0f;
-  float viewport_offset_x = pa_cl_vte_cntl.vport_x_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
-                                : std::abs(viewport_scale_x);
-  float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
-                                : std::abs(viewport_scale_y);
-  float viewport_offset_z = pa_cl_vte_cntl.vport_z_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32
-                                : 0.0f;
-  if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
-    viewport_offset_x += float(pa_sc_window_offset.window_x_offset);
-    viewport_offset_y += float(pa_sc_window_offset.window_y_offset);
-  }
   VkViewport viewport;
-  viewport.x = (viewport_offset_x - viewport_scale_x) * float(pixel_size_x);
-  viewport.y = (viewport_offset_y - viewport_scale_y) * float(pixel_size_y);
-  viewport.width = viewport_scale_x * 2.0f * float(pixel_size_x);
-  viewport.height = viewport_scale_y * 2.0f * float(pixel_size_y);
-  viewport.minDepth = std::min(std::max(viewport_offset_z, 0.0f), 1.0f);
-  viewport.maxDepth =
-      std::min(std::max(viewport_offset_z + viewport_scale_z, 0.0f), 1.0f);
+  viewport.x = viewport_info.left;
+  viewport.y = viewport_info.top;
+  viewport.width = viewport_info.width;
+  viewport.height = viewport_info.height;
+  viewport.minDepth = viewport_info.z_min;
+  viewport.maxDepth = viewport_info.z_max;
   ff_viewport_update_needed_ |= ff_viewport_.x != viewport.x;
   ff_viewport_update_needed_ |= ff_viewport_.y != viewport.y;
   ff_viewport_update_needed_ |= ff_viewport_.width != viewport.width;
@@ -1326,16 +1316,39 @@
 }
 
 void VulkanCommandProcessor::UpdateSystemConstantValues(
-    xenos::Endian index_endian) {
+    xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info) {
 #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
   SCOPE_profile_cpu_f("gpu");
 #endif  // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
 
   const RegisterFile& regs = *register_file_;
+  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
   int32_t vgt_indx_offset = int32_t(regs[XE_GPU_REG_VGT_INDX_OFFSET].u32);
 
   bool dirty = false;
 
+  // Flags.
+  uint32_t flags = 0;
+  // W0 division control.
+  // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
+  // 8: VTX_XY_FMT = true: the incoming XY have already been multiplied by 1/W0.
+  //               = false: multiply the X, Y coordinates by 1/W0.
+  // 9: VTX_Z_FMT = true: the incoming Z has already been multiplied by 1/W0.
+  //              = false: multiply the Z coordinate by 1/W0.
+  // 10: VTX_W0_FMT = true: the incoming W0 is not 1/W0. Perform the reciprocal
+  //                  to get 1/W0.
+  if (pa_cl_vte_cntl.vtx_xy_fmt) {
+    flags |= SpirvShaderTranslator::kSysFlag_XYDividedByW;
+  }
+  if (pa_cl_vte_cntl.vtx_z_fmt) {
+    flags |= SpirvShaderTranslator::kSysFlag_ZDividedByW;
+  }
+  if (pa_cl_vte_cntl.vtx_w0_fmt) {
+    flags |= SpirvShaderTranslator::kSysFlag_WNotReciprocal;
+  }
+  dirty |= system_constants_.flags != flags;
+  system_constants_.flags = flags;
+
   // Index or tessellation edge factor buffer endianness.
   dirty |= system_constants_.vertex_index_endian != index_endian;
   system_constants_.vertex_index_endian = index_endian;
@@ -1344,6 +1357,14 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
   dirty |= system_constants_.vertex_base_index != vgt_indx_offset;
   system_constants_.vertex_base_index = vgt_indx_offset;
 
+  // Conversion to host normalized device coordinates.
+  for (uint32_t i = 0; i < 3; ++i) {
+    dirty |= system_constants_.ndc_scale[i] != viewport_info.ndc_scale[i];
+    dirty |= system_constants_.ndc_offset[i] != viewport_info.ndc_offset[i];
+    system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i];
+    system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i];
+  }
+
   if (dirty) {
     current_graphics_descriptor_set_values_up_to_date_ &=
         ~(uint32_t(1) << SpirvShaderTranslator::kDescriptorSetSystemConstants);
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h
index a7283d56f..e083b3755 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h
@@ -19,6 +19,7 @@
 #include
 
 #include "xenia/gpu/command_processor.h"
+#include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/spirv_shader_translator.h"
 #include "xenia/gpu/vulkan/deferred_command_buffer.h"
 #include "xenia/gpu/vulkan/vulkan_graphics_system.h"
@@ -170,8 +171,9 @@
 
   VkShaderStageFlags GetGuestVertexShaderStageFlags() const;
 
-  void UpdateFixedFunctionState();
-  void UpdateSystemConstantValues(xenos::Endian index_endian);
+  void UpdateFixedFunctionState(const draw_util::ViewportInfo& viewport_info);
+  void UpdateSystemConstantValues(xenos::Endian index_endian,
+                                  const draw_util::ViewportInfo& viewport_info);
   bool UpdateBindings(const VulkanShader* vertex_shader,
                       const VulkanShader* pixel_shader);
   // Allocates a descriptor, space in the uniform buffer pool, and fills the
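Editorial note, not part of the patch: the reason ndc_scale/ndc_offset are needed at all is that the host viewport chosen here may differ from the guest one (it can be clamped to maxViewportDimensions). Ignoring the Y flip and half-pixel offset details that GetHostViewportInfo also handles, if the guest viewport transform for an axis is pixel = ndc * S_guest + O_guest and the host viewport maps pixel = ndc * S_host + O_host, then emitting ndc_host = ndc_guest * ndc_scale + ndc_offset with ndc_scale = S_guest / S_host and ndc_offset = (O_guest - O_host) / S_host yields the same pixel coordinate. A minimal per-axis sketch of that relationship, under those assumptions and not Xenia's actual GetHostViewportInfo code:

// Illustrative only - per-axis relationship between a guest viewport
// (scale/offset from the PA_CL_VPORT_* registers) and a host viewport that
// may have been clamped to device limits.
struct ViewportAxis {
  float scale;   // Half-extent of the viewport in pixels.
  float offset;  // Center of the viewport in pixels.
};

// Computes {ndc_scale, ndc_offset} such that applying the host viewport to
// (guest_ndc * ndc_scale + ndc_offset) lands on the same pixel as applying
// the guest viewport to guest_ndc.
inline void NdcRemapForAxis(ViewportAxis guest, ViewportAxis host,
                            float& ndc_scale, float& ndc_offset) {
  ndc_scale = guest.scale / host.scale;
  ndc_offset = (guest.offset - host.offset) / host.scale;
}

When the host can use the guest viewport unmodified, this reduces to ndc_scale = 1 and ndc_offset = 0, and the constants are effectively a no-op in the translated shader.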