From 7595cdb52bd12d448aeabe4908862f59d283ce9d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Wed, 27 Jul 2022 17:14:28 +0300 Subject: [PATCH] [Vulkan] Non-GS point sprites + minor SPIR-V fixes --- .../gpu/d3d12/d3d12_command_processor.cc | 7 +- .../gpu/d3d12/d3d12_primitive_processor.cc | 25 +- .../gpu/d3d12/d3d12_primitive_processor.h | 5 +- src/xenia/gpu/dxbc_shader_translator.cc | 4 - src/xenia/gpu/primitive_processor.cc | 250 +++++++-- src/xenia/gpu/primitive_processor.h | 57 ++- src/xenia/gpu/spirv_shader_translator.cc | 482 ++++++++++++++---- src/xenia/gpu/spirv_shader_translator.h | 9 +- .../gpu/vulkan/vulkan_command_processor.cc | 90 ++-- .../gpu/vulkan/vulkan_command_processor.h | 4 +- src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc | 29 +- src/xenia/gpu/vulkan/vulkan_pipeline_cache.h | 2 +- .../gpu/vulkan/vulkan_primitive_processor.cc | 26 +- .../gpu/vulkan/vulkan_primitive_processor.h | 5 +- 14 files changed, 721 insertions(+), 274 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index b6f72ff9b..129f89fd0 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2268,7 +2268,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, UpdateSystemConstantValues( memexport_used, primitive_polygonal, primitive_processing_result.line_loop_closing_index, - primitive_processing_result.host_index_endian, viewport_info, + primitive_processing_result.host_shader_index_endian, viewport_info, used_texture_mask, normalized_depth_control, normalized_color_mask); // Update constant buffers, descriptors and root parameters. 
@@ -2513,7 +2513,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } ID3D12Resource* scratch_index_buffer = nullptr; switch (primitive_processing_result.index_buffer_type) { - case PrimitiveProcessor::ProcessedIndexBufferType::kGuest: { + case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA: { if (memexport_used) { // If the shared memory is a UAV, it can't be used as an index buffer // (UAV is a read/write state, index buffer is a read-only state). @@ -2545,7 +2545,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, primitive_processor_->GetConvertedIndexBufferGpuAddress( primitive_processing_result.host_index_buffer_handle); break; - case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA: index_buffer_view.BufferLocation = primitive_processor_->GetBuiltinIndexBufferGpuAddress( primitive_processing_result.host_index_buffer_handle); diff --git a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc index a806546a1..03e67d9ac 100644 --- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc @@ -28,7 +28,7 @@ namespace d3d12 { D3D12PrimitiveProcessor::~D3D12PrimitiveProcessor() { Shutdown(true); } bool D3D12PrimitiveProcessor::Initialize() { - if (!InitializeCommon(true, false, false, true)) { + if (!InitializeCommon(true, false, false, true, true, true)) { Shutdown(); return false; } @@ -83,9 +83,9 @@ void D3D12PrimitiveProcessor::EndFrame() { frame_index_buffers_.clear(); } -bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, std::function fill_callback) { - assert_not_zero(index_count); +bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) { + 
assert_not_zero(size_bytes); assert_null(builtin_index_buffer_); assert_null(builtin_index_buffer_upload_); @@ -94,9 +94,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( ID3D12Device* device = provider.GetDevice(); D3D12_RESOURCE_DESC resource_desc; - ui::d3d12::util::FillBufferResourceDesc( - resource_desc, UINT64(sizeof(uint16_t) * index_count), - D3D12_RESOURCE_FLAG_NONE); + ui::d3d12::util::FillBufferResourceDesc(resource_desc, UINT64(size_bytes), + D3D12_RESOURCE_FLAG_NONE); Microsoft::WRL::ComPtr draw_resource; if (FAILED(device->CreateCommittedResource( &ui::d3d12::util::kHeapPropertiesDefault, @@ -105,8 +104,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( IID_PPV_ARGS(&draw_resource)))) { XELOGE( "D3D12 primitive processor: Failed to create the built-in index " - "buffer GPU resource with {} 16-bit indices", - index_count); + "buffer GPU resource with {} bytes", + size_bytes); return false; } Microsoft::WRL::ComPtr upload_resource; @@ -117,8 +116,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( IID_PPV_ARGS(&upload_resource)))) { XELOGE( "D3D12 primitive processor: Failed to create the built-in index " - "buffer upload resource with {} 16-bit indices", - index_count); + "buffer upload resource with {} bytes", + size_bytes); return false; } @@ -127,8 +126,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( if (FAILED(upload_resource->Map(0, &upload_read_range, &mapping))) { XELOGE( "D3D12 primitive processor: Failed to map the built-in index buffer " - "upload resource with {} 16-bit indices", - index_count); + "upload resource with {} bytes", + size_bytes); return false; } fill_callback(reinterpret_cast(mapping)); diff --git a/src/xenia/gpu/d3d12/d3d12_primitive_processor.h b/src/xenia/gpu/d3d12/d3d12_primitive_processor.h index 81e1812a6..8ac02f4db 100644 --- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.h @@ -56,9 +56,8 
@@ class D3D12PrimitiveProcessor final : public PrimitiveProcessor { } protected: - bool InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, - std::function fill_callback) override; + bool InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) override; void* RequestHostConvertedIndexBufferForCurrentFrame( xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd, diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 602da9ce8..daa8cf782 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -964,8 +964,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() { // Check if the shader returns XY/W rather than XY, and if it does, revert // that. - // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_XYDividedByW)); a_.OpIf(true, temp_x_src); a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0011), @@ -974,8 +972,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() { a_.OpEndIf(); // Check if the shader returns Z/W rather than Z, and if it does, revert that. - // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. 
a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_ZDividedByW)); a_.OpIf(true, temp_x_src); a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0100), diff --git a/src/xenia/gpu/primitive_processor.cc b/src/xenia/gpu/primitive_processor.cc index 68da6d100..827fb7b4e 100644 --- a/src/xenia/gpu/primitive_processor.cc +++ b/src/xenia/gpu/primitive_processor.cc @@ -9,6 +9,7 @@ #include "xenia/gpu/primitive_processor.h" +#include #include #include #include @@ -106,7 +107,9 @@ PrimitiveProcessor::~PrimitiveProcessor() { ShutdownCommon(); } bool PrimitiveProcessor::InitializeCommon( bool full_32bit_vertex_indices_supported, bool triangle_fans_supported, - bool line_loops_supported, bool quad_lists_supported) { + bool line_loops_supported, bool quad_lists_supported, + bool point_sprites_supported_without_vs_expansion, + bool rectangle_lists_supported_without_vs_expansion) { full_32bit_vertex_indices_used_ = full_32bit_vertex_indices_supported; convert_triangle_fans_to_lists_ = !triangle_fans_supported || cvars::force_convert_triangle_fans_to_lists; @@ -115,33 +118,94 @@ bool PrimitiveProcessor::InitializeCommon( convert_quad_lists_to_triangle_lists_ = !quad_lists_supported || cvars::force_convert_quad_lists_to_triangle_lists; + // No override cvars as hosts are not required to support the fallback paths + // since they require different vertex shader structure (for the fallback + // HostVertexShaderTypes). + expand_point_sprites_in_vs_ = !point_sprites_supported_without_vs_expansion; + expand_rectangle_lists_in_vs_ = + !rectangle_lists_supported_without_vs_expansion; // Initialize the index buffer for conversion of auto-indexed primitive types. 
- uint32_t builtin_index_count = 0; + size_t builtin_index_buffer_size = 0; + // 32-bit, before 16-bit due to alignment (for primitive expansion - when the + // indices encode not only the guest vertex index, but also a part needed for + // host expansion, thus may contain values above UINT16_MAX, such as up to + // (UINT16_MAX - 1) * 4 + 3 for point sprites). + // Using an index buffer for point sprite and rectangle list expansion instead + // of instancing as how instancing is implemented may vary wildly between + // GPUs, potentially slowly (like no different instances in the same + // wavefront) with small vertex counts per instance. Also using triangle + // strips with primitive restart, not triangle lists, so the vertex shader may + // be invoked once for the inner edge vertices, which is important for memory + // export in guest shaders, not to write to the same location from two + // invocations. + uint32_t builtin_ib_two_triangle_strip_count = 0; + if (expand_point_sprites_in_vs_) { + builtin_ib_two_triangle_strip_count = + std::max(uint32_t(UINT16_MAX), builtin_ib_two_triangle_strip_count); + } + if (expand_rectangle_lists_in_vs_) { + builtin_ib_two_triangle_strip_count = + std::max(uint32_t(UINT16_MAX / 3), builtin_ib_two_triangle_strip_count); + } + if (builtin_ib_two_triangle_strip_count) { + builtin_ib_offset_two_triangle_strips_ = builtin_index_buffer_size; + builtin_index_buffer_size += + sizeof(uint32_t) * + GetTwoTriangleStripIndexCount(builtin_ib_two_triangle_strip_count); + } else { + builtin_ib_offset_two_triangle_strips_ = SIZE_MAX; + } + // 16-bit (for indirection on top of single auto-indexed vertices) - enough + // even if the backend has primitive reset enabled all the time (Metal) as + // auto-indexed draws are limited to UINT16_MAX vertices, not UINT16_MAX + 1. 
if (convert_triangle_fans_to_lists_) { - builtin_ib_offset_triangle_fans_to_lists_ = - sizeof(uint16_t) * builtin_index_count; - builtin_index_count += GetTriangleFanListIndexCount(UINT16_MAX); + builtin_ib_offset_triangle_fans_to_lists_ = builtin_index_buffer_size; + builtin_index_buffer_size += + sizeof(uint16_t) * GetTriangleFanListIndexCount(UINT16_MAX); } else { builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX; } if (convert_quad_lists_to_triangle_lists_) { - builtin_ib_offset_quad_lists_to_triangle_lists_ = - sizeof(uint16_t) * builtin_index_count; - builtin_index_count += GetQuadListTriangleListIndexCount(UINT16_MAX); + builtin_ib_offset_quad_lists_to_triangle_lists_ = builtin_index_buffer_size; + builtin_index_buffer_size += + sizeof(uint16_t) * GetQuadListTriangleListIndexCount(UINT16_MAX); } else { builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX; } - if (builtin_index_count) { - if (!InitializeBuiltin16BitIndexBuffer( - builtin_index_count, [this](uint16_t* mapping) { + if (builtin_index_buffer_size) { + if (!InitializeBuiltinIndexBuffer( + builtin_index_buffer_size, + [this, builtin_ib_two_triangle_strip_count](void* mapping) { + uint32_t* mapping_32bit = reinterpret_cast(mapping); + if (builtin_ib_offset_two_triangle_strips_ != SIZE_MAX) { + // Two-triangle strips. + uint32_t* two_triangle_strip_ptr = + mapping_32bit + + builtin_ib_offset_two_triangle_strips_ / sizeof(uint32_t); + for (uint32_t i = 0; i < builtin_ib_two_triangle_strip_count; + ++i) { + if (i) { + // Primitive restart. + *(two_triangle_strip_ptr++) = UINT32_MAX; + } + // Host vertex index within the pair in the lower 2 bits, + // guest primitive index in the rest. 
+ uint32_t two_triangle_strip_first_index = i << 2; + for (uint32_t j = 0; j < 4; ++j) { + *(two_triangle_strip_ptr++) = + two_triangle_strip_first_index + j; + } + } + } + uint16_t* mapping_16bit = reinterpret_cast(mapping); if (builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX) { // Triangle fans as triangle lists. // Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D. // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans uint16_t* triangle_list_ptr = - mapping + builtin_ib_offset_triangle_fans_to_lists_ / - sizeof(uint16_t); + mapping_16bit + builtin_ib_offset_triangle_fans_to_lists_ / + sizeof(uint16_t); for (uint32_t i = 2; i < UINT16_MAX; ++i) { *(triangle_list_ptr++) = uint16_t(i - 1); *(triangle_list_ptr++) = uint16_t(i); @@ -150,8 +214,9 @@ bool PrimitiveProcessor::InitializeCommon( } if (builtin_ib_offset_quad_lists_to_triangle_lists_ != SIZE_MAX) { uint16_t* triangle_list_ptr = - mapping + builtin_ib_offset_quad_lists_to_triangle_lists_ / - sizeof(uint16_t); + mapping_16bit + + builtin_ib_offset_quad_lists_to_triangle_lists_ / + sizeof(uint16_t); // TODO(Triang3l): SIMD for faster initialization? for (uint32_t i = 0; i < UINT16_MAX / 4; ++i) { uint16_t quad_first_index = uint16_t(i * 4); @@ -309,15 +374,27 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { return false; } } else { + host_vertex_shader_type = Shader::HostVertexShaderType::kVertex; switch (guest_primitive_type) { case xenos::PrimitiveType::kPointList: + if (expand_point_sprites_in_vs_) { + host_primitive_type = xenos::PrimitiveType::kTriangleStrip; + host_vertex_shader_type = + Shader::HostVertexShaderType::kPointListAsTriangleStrip; + } + break; case xenos::PrimitiveType::kLineList: case xenos::PrimitiveType::kLineStrip: case xenos::PrimitiveType::kTriangleList: case xenos::PrimitiveType::kTriangleStrip: + // Supported natively on all backends. 
+ break; case xenos::PrimitiveType::kRectangleList: - // Supported natively or through geometry or compute shaders on all - // backends. + if (expand_rectangle_lists_in_vs_) { + host_primitive_type = xenos::PrimitiveType::kTriangleStrip; + host_vertex_shader_type = + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip; + } break; case xenos::PrimitiveType::kTriangleFan: if (convert_triangle_fans_to_lists_) { @@ -342,7 +419,6 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_always(); return false; } - host_vertex_shader_type = Shader::HostVertexShaderType::kVertex; } // Process the indices. @@ -359,12 +435,86 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { guest_draw_vertex_count = vgt_dma_size.num_words; } uint32_t line_loop_closing_index = 0; - uint32_t guest_index_base; + uint32_t guest_index_base = 0, guest_index_buffer_needed_bytes = 0; CachedResult cacheable; cacheable.host_draw_vertex_count = guest_draw_vertex_count; cacheable.host_primitive_reset_enabled = false; cacheable.host_index_buffer_handle = SIZE_MAX; - if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) { + if (host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip || + host_vertex_shader_type == + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) { + // As two-triangle strips, with guest indices being either autogenerated or + // fetched via DMA. 
+ uint32_t primitive_count = guest_draw_vertex_count; + if (host_vertex_shader_type == + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) { + primitive_count /= 3; + } + cacheable.host_draw_vertex_count = + GetTwoTriangleStripIndexCount(primitive_count); + cacheable.host_index_format = xenos::IndexFormat::kInt32; + cacheable.host_primitive_reset_enabled = true; + assert_true(builtin_ib_offset_two_triangle_strips_ != SIZE_MAX); + cacheable.host_index_buffer_handle = builtin_ib_offset_two_triangle_strips_; + if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) { + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForAuto; + cacheable.host_shader_index_endian = xenos::Endian::kNone; + } else { + // There is an index buffer. + assert_true(vgt_draw_initiator.source_select == + xenos::SourceSelect::kDMA); + if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA) { + // TODO(Triang3l): Support immediate-indexed vertices. + XELOGE( + "Primitive processor: Unsupported vertex index source {}. Report " + "the game to Xenia developers!", + uint32_t(vgt_draw_initiator.source_select)); + return false; + } + xenos::IndexFormat guest_index_format = vgt_draw_initiator.index_size; + // Normalize the endian. + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForDMA; + xenos::Endian guest_index_endian = vgt_dma_size.swap_mode; + if (guest_index_format == xenos::IndexFormat::kInt16 && + (guest_index_endian != xenos::Endian::kNone && + guest_index_endian != xenos::Endian::k8in16)) { + XELOGW( + "Primitive processor: 32-bit endian swap mode {} is used for " + "16-bit indices. This shouldn't normally be happening, but report " + "the game to Xenia developers for investigation of the intended " + "behavior (ignore or actually swap across adjacent indices)! 
" + "Currently disabling the swap for 16-and-32 and replacing 8-in-32 " + "with 8-in-16.", + uint32_t(guest_index_endian)); + guest_index_endian = guest_index_endian == xenos::Endian::k8in32 + ? xenos::Endian::k8in16 + : xenos::Endian::kNone; + } + cacheable.host_shader_index_endian = guest_index_endian; + // Get the index buffer memory range. + uint32_t index_size_log2 = + guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2; + // The base should already be aligned, but aligning here too for safety. + guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 & + ~uint32_t((1 << index_size_log2) - 1); + guest_index_buffer_needed_bytes = guest_draw_vertex_count + << index_size_log2; + if (guest_index_base > SharedMemory::kBufferSize || + SharedMemory::kBufferSize - guest_index_base < + guest_index_buffer_needed_bytes) { + XELOGE( + "Primitive processor: Index buffer at 0x{:08X}, 0x{:X} bytes " + "required, is out of the physical memory bounds", + guest_index_base, guest_index_buffer_needed_bytes); + assert_always(); + return false; + } + } + } else if (vgt_draw_initiator.source_select == + xenos::SourceSelect::kAutoIndex) { // Auto-indexed - use a remapping index buffer if needed to change the // primitive type. 
if (tessellation_enabled && @@ -376,9 +526,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_always(); return false; } - guest_index_base = 0; cacheable.host_index_format = xenos::IndexFormat::kInt16; - cacheable.host_index_endian = xenos::Endian::kNone; + cacheable.host_shader_index_endian = xenos::Endian::kNone; cacheable.host_primitive_reset_enabled = false; cacheable.index_buffer_type = ProcessedIndexBufferType::kNone; if (host_primitive_type != guest_primitive_type) { @@ -388,7 +537,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { xenos::PrimitiveType::kTriangleList); cacheable.host_draw_vertex_count = GetTriangleFanListIndexCount(cacheable.host_draw_vertex_count); - cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin; + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForAuto; assert_true(builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX); cacheable.host_index_buffer_handle = builtin_ib_offset_triangle_fans_to_lists_; @@ -409,7 +559,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { xenos::PrimitiveType::kTriangleList); cacheable.host_draw_vertex_count = GetQuadListTriangleListIndexCount( cacheable.host_draw_vertex_count); - cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin; + cacheable.index_buffer_type = + ProcessedIndexBufferType::kHostBuiltinForAuto; assert_true(builtin_ib_offset_quad_lists_to_triangle_lists_ != SIZE_MAX); cacheable.host_index_buffer_handle = @@ -503,8 +654,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { // The base should already be aligned, but aligning here too for safety. 
guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 & ~uint32_t((1 << index_size_log2) - 1); - uint32_t guest_index_buffer_needed_bytes = guest_draw_vertex_count - << index_size_log2; + guest_index_buffer_needed_bytes = guest_draw_vertex_count + << index_size_log2; if (guest_index_base > SharedMemory::kBufferSize || SharedMemory::kBufferSize - guest_index_base < guest_index_buffer_needed_bytes) { @@ -517,7 +668,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { } cacheable.host_index_format = guest_index_format; - cacheable.host_index_endian = guest_index_endian; + cacheable.host_shader_index_endian = guest_index_endian; uint32_t guest_index_mask_guest_endian = guest_index_format == xenos::IndexFormat::kInt16 ? UINT16_MAX @@ -666,7 +817,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_unhandled_case(guest_index_endian); return false; } - cacheable.host_index_endian = xenos::Endian::kNone; + cacheable.host_shader_index_endian = xenos::Endian::kNone; } } cache_transaction.SetNewResult(cacheable); @@ -677,7 +828,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { // endian-swap, or even to safely drop the upper 8 bits if no swap is even // needed) indirectly. cacheable.host_draw_vertex_count = guest_draw_vertex_count; - cacheable.index_buffer_type = ProcessedIndexBufferType::kGuest; + cacheable.index_buffer_type = ProcessedIndexBufferType::kGuestDMA; cacheable.host_primitive_reset_enabled = guest_primitive_reset_enabled; if (guest_primitive_reset_enabled) { if (guest_index_format == xenos::IndexFormat::kInt16) { @@ -742,8 +893,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { } else { // Low 24 bits of the guest index are compared to the primitive reset // index. 
If the backend doesn't support full 32-bit indices, for - // ProcessedIndexBufferType::kGuest, the host needs to read the buffer - // indirectly in the vertex shaders and swap, and for + // ProcessedIndexBufferType::kGuestDMA, the host needs to read the + // buffer indirectly in the vertex shaders and swap, and for // ProcessedIndexBufferType::kHostConverted (if primitive reset is // actually used, thus exactly 0xFFFFFFFF must be sent to the host for // it in a true index buffer), no indirection is done, but @@ -800,26 +951,31 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { assert_unhandled_case(guest_index_endian); return false; } - cacheable.host_index_endian = full_32bit_vertex_indices_used_ - ? guest_index_endian - : xenos::Endian::kNone; + cacheable.host_shader_index_endian = + full_32bit_vertex_indices_used_ ? guest_index_endian + : xenos::Endian::kNone; } cache_transaction.SetNewResult(cacheable); } } } - if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuest) { - // Request the index buffer memory. - // TODO(Triang3l): Shared memory request cache. - if (!shared_memory_.RequestRange(guest_index_base, - guest_index_buffer_needed_bytes)) { - XELOGE( - "PrimitiveProcessor: Failed to request index buffer 0x{:08X}, " - "0x{:X} bytes needed, in the shared memory", - guest_index_base, guest_index_buffer_needed_bytes); - return false; - } - } + } + } + + // Request the indices in the shared memory if they need to be accessed from + // there on the GPU. + if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuestDMA || + cacheable.index_buffer_type == + ProcessedIndexBufferType::kHostBuiltinForDMA) { + // Request the index buffer memory. + // TODO(Triang3l): Shared memory request cache. 
+ if (!shared_memory_.RequestRange(guest_index_base, + guest_index_buffer_needed_bytes)) { + XELOGE( + "PrimitiveProcessor: Failed to request index buffer 0x{:08X}, 0x{:X} " + "bytes needed, in the shared memory", + guest_index_base, guest_index_buffer_needed_bytes); + return false; } } @@ -832,7 +988,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { result_out.index_buffer_type = cacheable.index_buffer_type; result_out.guest_index_base = guest_index_base; result_out.host_index_format = cacheable.host_index_format; - result_out.host_index_endian = cacheable.host_index_endian; + result_out.host_shader_index_endian = cacheable.host_shader_index_endian; result_out.host_primitive_reset_enabled = cacheable.host_primitive_reset_enabled; result_out.host_index_buffer_handle = cacheable.host_index_buffer_handle; diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h index cfbec0ae9..6a77a3d0f 100644 --- a/src/xenia/gpu/primitive_processor.h +++ b/src/xenia/gpu/primitive_processor.h @@ -10,6 +10,7 @@ #ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_ #define XENIA_GPU_PRIMITIVE_PROCESSOR_H_ +#include #include #include #include @@ -110,13 +111,16 @@ class PrimitiveProcessor { // For 32-bit, indirection is needed if the host only supports 24-bit // indices (even for non-endian-swapped, as the GPU should be ignoring the // upper 8 bits completely, rather than exhibiting undefined behavior. - kGuest, + kGuestDMA, // Converted and stored in the primitive converter for the current draw // command. For 32-bit indices, if the host doesn't support all 32 bits, // this kind of an index buffer will always be pre-masked and pre-swapped. kHostConverted, // Auto-indexed on the guest, but with an adapter index buffer on the host. - kHostBuiltin, + kHostBuiltinForAuto, + // Adapter index buffer on the host for indirect loading of indices via DMA + // (from the shared memory). 
+ kHostBuiltinForDMA, }; struct ProcessingResult { @@ -136,13 +140,14 @@ class PrimitiveProcessor { ProcessedIndexBufferType index_buffer_type; uint32_t guest_index_base; xenos::IndexFormat host_index_format; - xenos::Endian host_index_endian; + xenos::Endian host_shader_index_endian; // The reset index, if enabled, is always 0xFFFF for host_index_format // kInt16 and 0xFFFFFFFF for kInt32. Never enabled for "list" primitive // types, thus safe for direct usage on Vulkan. bool host_primitive_reset_enabled; // Backend-specific handle for the index buffer valid for the current draw, - // only valid for index_buffer_type kHostConverted and kHostBuiltin. + // only valid for index_buffer_type kHostConverted, kHostBuiltinForAuto and + // kHostBuiltinForDMA. size_t host_index_buffer_handle; bool IsTessellated() const { return Shader::IsHostVertexShaderTypeDomain(host_vertex_shader_type); @@ -165,6 +170,12 @@ class PrimitiveProcessor { bool IsConvertingQuadListsToTriangleLists() const { return convert_quad_lists_to_triangle_lists_; } + bool IsExpandingPointSpritesInVS() const { + return expand_point_sprites_in_vs_; + } + bool IsExpandingRectangleListsInVS() const { + return expand_rectangle_lists_in_vs_; + } // Submission must be open to call (may request the index buffer in the shared // memory). @@ -217,8 +228,8 @@ class PrimitiveProcessor { // if indirection may be needed. // - When full 32-bit indices are not supported, the host must be using // auto-indexed draws for 32-bit indices of ProcessedIndexBufferType - // kGuest, while fetching the index data manually from the shared memory - // buffer and endian-swapping it. + // kGuestDMA, while fetching the index data manually from the shared + // memory buffer and endian-swapping it. 
// - Indirection, however, precludes primitive reset usage - so if // primitive reset is needed, the primitive processor will pre-swap and // pre-mask the index buffer so there are only host-endian 0x00###### or @@ -235,19 +246,26 @@ class PrimitiveProcessor { // those guest primitive types directly or through geometry shader // emulation. Debug overriding will be resolved in the common code if // needed. + // - point_sprites_supported_without_vs_expansion, + // rectangle_lists_supported_without_vs_expansion: + // - Pass true or false depending on whether the host actually supports + // those guest primitive types directly or through geometry shader + // emulation. Overrides do not apply to these as hosts are not required to + // support the fallback paths since they require different vertex shader + // structure (for the fallback HostVertexShaderTypes). bool InitializeCommon(bool full_32bit_vertex_indices_supported, bool triangle_fans_supported, bool line_loops_supported, - bool quad_lists_supported); + bool quad_lists_supported, + bool point_sprites_supported_without_vs_expansion, + bool rectangle_lists_supported_without_vs_expansion); // If any primitive type conversion is needed for auto-indexed draws, called // from InitializeCommon (thus only once in the primitive processor's // lifetime) to set up the backend's index buffer containing indices for - // primitive type remapping. The backend must allocate a `sizeof(uint16_t) * - // index_count` buffer and call fill_callback for its mapping if creation is - // successful. 16-bit indices are enough even if the backend has primitive - // reset enabled all the time (Metal) as auto-indexed draws are limited to - // UINT16_MAX vertices, not UINT16_MAX + 1. - virtual bool InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, std::function fill_callback) = 0; + // primitive type remapping. 
The backend must allocate a 4-byte-aligned buffer + // with `size_bytes` and call fill_callback for its mapping if creation has + // been successful. + virtual bool InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) = 0; // Call last in implementation-specific shutdown, also callable from the // destructor. void ShutdownCommon(); @@ -509,6 +527,12 @@ class PrimitiveProcessor { } }; + static constexpr uint32_t GetTwoTriangleStripIndexCount( + uint32_t strip_count) { + // 4 vertices per strip, and primitive restarts between strips. + return 4 * strip_count + (std::max(strip_count, UINT32_C(1)) - 1); + } + // Triangle fan test cases: // - 4D5307E6 - main menu - game logo, developer logo, backgrounds of the menu // item list (the whole menu and individual items) - no index buffer. @@ -675,8 +699,11 @@ class PrimitiveProcessor { bool convert_triangle_fans_to_lists_ = false; bool convert_line_loops_to_strips_ = false; bool convert_quad_lists_to_triangle_lists_ = false; + bool expand_point_sprites_in_vs_ = false; + bool expand_rectangle_lists_in_vs_ = false; // Byte offsets used, for simplicity, directly as handles. 
+ size_t builtin_ib_offset_two_triangle_strips_ = SIZE_MAX; size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX; size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX; @@ -745,7 +772,7 @@ class PrimitiveProcessor { uint32_t host_draw_vertex_count; ProcessedIndexBufferType index_buffer_type; xenos::IndexFormat host_index_format; - xenos::Endian host_index_endian; + xenos::Endian host_shader_index_endian; bool host_primitive_reset_enabled; size_t host_index_buffer_handle; }; diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 199d0f99c..bb89e0d41 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -111,6 +111,7 @@ void SpirvShaderTranslator::Reset() { input_front_facing_ = spv::NoResult; std::fill(input_output_interpolators_.begin(), input_output_interpolators_.end(), spv::NoResult); + output_point_coordinates_ = spv::NoResult; output_point_size_ = spv::NoResult; sampler_bindings_.clear(); @@ -1097,18 +1098,33 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() { Modification shader_modification = GetSpirvShaderModification(); - // Create the point size output. Not using gl_PointSize from gl_PerVertex not - // to rely on the shaderTessellationAndGeometryPointSize feature, and also - // because the value written to gl_PointSize must be greater than zero. 
- if (shader_modification.vertex.output_point_size) { - output_point_size_ = - builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput, - type_float_, "xe_out_point_size"); - builder_->addDecoration(output_point_size_, spv::DecorationLocation, - int(output_location)); - builder_->addDecoration(output_point_size_, spv::DecorationInvariant); - main_interface_.push_back(output_point_size_); - ++output_location; + if (shader_modification.vertex.output_point_parameters) { + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Create the point coordinates output. + output_point_coordinates_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float2_, "xe_out_point_coordinates"); + builder_->addDecoration(output_point_coordinates_, + spv::DecorationLocation, int(output_location)); + builder_->addDecoration(output_point_coordinates_, + spv::DecorationInvariant); + main_interface_.push_back(output_point_coordinates_); + ++output_location; + } else { + // Create the point size output. Not using gl_PointSize from gl_PerVertex + // not to rely on the shaderTessellationAndGeometryPointSize feature, and + // also because the value written to gl_PointSize must be greater than + // zero. + output_point_size_ = + builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput, + type_float_, "xe_out_point_size"); + builder_->addDecoration(output_point_size_, spv::DecorationLocation, + int(output_location)); + builder_->addDecoration(output_point_size_, spv::DecorationInvariant); + main_interface_.push_back(output_point_size_); + ++output_location; + } } // Create the gl_PerVertex output for used system outputs. 
@@ -1172,24 +1188,33 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { } } + Modification shader_modification = GetSpirvShaderModification(); + + // TODO(Triang3l): For HostVertexShaderType::kRectangleListAsTriangleStrip, + // start the vertex loop, and load the index there. + // Load the vertex index or the tessellation parameters. if (register_count()) { // TODO(Triang3l): Barycentric coordinates and patch index. if (IsSpirvVertexShader()) { - // TODO(Triang3l): Close line loop primitive. - // Load the unswapped index as uint for swapping, or for indirect loading - // if needed. spv::Id vertex_index = builder_->createUnaryOp( spv::OpBitcast, type_uint_, builder_->createLoad(input_vertex_index_, spv::NoPrecision)); - if (!features_.full_draw_index_uint32) { - // Check if the full 32-bit index needs to be loaded indirectly. + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Load the point index, autogenerated or indirectly from the index + // buffer. + // Extract the primitive index from the two-triangle strip vertex index. + spv::Id const_uint_2 = builder_->makeUintConstant(2); + vertex_index = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, vertex_index, const_uint_2); + // Check if the index needs to be loaded from the index buffer. 
spv::Id load_vertex_index = builder_->createBinOp( spv::OpINotEqual, type_bool_, builder_->createBinOp( spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant( - static_cast(kSysFlag_VertexIndexLoad))), + builder_->makeUintConstant(static_cast( + kSysFlag_ComputeOrPrimitiveVertexIndexLoad))), const_uint_0_); spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint(); spv::Block& block_load_vertex_index_start = builder_->makeNewBlock(); @@ -1200,25 +1225,61 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { &block_load_vertex_index_start, &block_load_vertex_index_merge); builder_->setBuildPoint(&block_load_vertex_index_start); - // Load the 32-bit index. - // TODO(Triang3l): Bounds checking. + // Check if the index is 32-bit. + spv::Id vertex_index_is_32bit = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(static_cast( + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit))), + const_uint_0_); + // Calculate the vertex index address in the shared memory. id_vector_temp_.clear(); id_vector_temp_.push_back( builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); + spv::Id vertex_index_address = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision), + builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, vertex_index, + builder_->createTriOp(spv::OpSelect, type_uint_, + vertex_index_is_32bit, const_uint_2, + builder_->makeUintConstant(1)))); + // Load the 32 bits containing the whole vertex index or two 16-bit + // vertex indices. + // TODO(Triang3l): Bounds checking. 
spv::Id loaded_vertex_index = LoadUint32FromSharedMemory(builder_->createUnaryOp( spv::OpBitcast, type_int_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + vertex_index_address, const_uint_2))); + // Extract the 16-bit index from the loaded 32 bits if needed. + loaded_vertex_index = builder_->createTriOp( + spv::OpSelect, type_uint_, vertex_index_is_32bit, + loaded_vertex_index, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, loaded_vertex_index, builder_->createBinOp( - spv::OpIAdd, type_uint_, - builder_->createBinOp( - spv::OpShiftRightLogical, type_uint_, - builder_->createLoad( - builder_->createAccessChain( - spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision), - builder_->makeUintConstant(2)), - vertex_index))); + spv::OpShiftLeftLogical, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + vertex_index_address, const_uint_2), + builder_->makeUintConstant(4 - 1)), + builder_->makeUintConstant(16))); + // Endian-swap the loaded index. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); + loaded_vertex_index = EndianSwap32Uint( + loaded_vertex_index, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision)); // Get the actual build point for phi. spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint(); builder_->createBranch(&block_load_vertex_index_merge); @@ -1238,19 +1299,81 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { builder_->getBuildPoint()->addInstruction( std::move(loaded_vertex_index_phi_op)); } + } else { + // TODO(Triang3l): Close line loop primitive. + // Load the unswapped index as uint for swapping, or for indirect + // loading if needed. + if (!features_.full_draw_index_uint32) { + // Check if the full 32-bit index needs to be loaded indirectly. 
+ spv::Id load_vertex_index = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant( + static_cast(kSysFlag_VertexIndexLoad))), + const_uint_0_); + spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint(); + spv::Block& block_load_vertex_index_start = builder_->makeNewBlock(); + spv::Block& block_load_vertex_index_merge = builder_->makeNewBlock(); + SpirvCreateSelectionMerge(block_load_vertex_index_merge.getId(), + spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch(load_vertex_index, + &block_load_vertex_index_start, + &block_load_vertex_index_merge); + builder_->setBuildPoint(&block_load_vertex_index_start); + // Load the 32-bit index. + // TODO(Triang3l): Bounds checking. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); + spv::Id loaded_vertex_index = + LoadUint32FromSharedMemory(builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + builder_->makeUintConstant(2)), + vertex_index))); + // Get the actual build point for phi. + spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint(); + builder_->createBranch(&block_load_vertex_index_merge); + // Select between the loaded index and the original index from Vulkan. 
+ builder_->setBuildPoint(&block_load_vertex_index_merge); + { + std::unique_ptr loaded_vertex_index_phi_op = + std::make_unique(builder_->getUniqueId(), + type_uint_, spv::OpPhi); + loaded_vertex_index_phi_op->addIdOperand(loaded_vertex_index); + loaded_vertex_index_phi_op->addIdOperand( + block_load_vertex_index_end.getId()); + loaded_vertex_index_phi_op->addIdOperand(vertex_index); + loaded_vertex_index_phi_op->addIdOperand( + block_load_vertex_index_pre.getId()); + vertex_index = loaded_vertex_index_phi_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(loaded_vertex_index_phi_op)); + } + } + // Endian-swap the index. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); + vertex_index = EndianSwap32Uint( + vertex_index, builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision)); } - // Endian-swap the index and convert to int. - id_vector_temp_.clear(); - id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); - spv::Id vertex_index_endian = - builder_->createLoad(builder_->createAccessChain( - spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision); - vertex_index = builder_->createUnaryOp( - spv::OpBitcast, type_int_, - EndianSwap32Uint(vertex_index, vertex_index_endian)); + // Convert the index to a signed integer. + vertex_index = + builder_->createUnaryOp(spv::OpBitcast, type_int_, vertex_index); // Add the base to the index. id_vector_temp_.clear(); id_vector_temp_.push_back( @@ -1301,61 +1424,66 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { builder_->createTriOp(spv::OpSelect, type_float_, is_w_not_reciprocal, position_w, guest_position_w_inv); - // Check if the shader returns XY/W rather than XY, and if it does, revert - // that. 
- // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. - uint_vector_temp_.clear(); - uint_vector_temp_.reserve(2); - uint_vector_temp_.push_back(0); - uint_vector_temp_.push_back(1); - spv::Id position_xy = builder_->createRvalueSwizzle( - spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_); - spv::Id is_xy_divided_by_w = builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant( - static_cast(kSysFlag_XYDividedByW))), - const_uint_0_); - spv::Id guest_position_xy_mul_w = builder_->createBinOp( - spv::OpVectorTimesScalar, type_float2_, position_xy, position_w); - builder_->addDecoration(guest_position_xy_mul_w, - spv::DecorationNoContraction); - position_xy = - builder_->createTriOp(spv::OpSelect, type_float2_, is_xy_divided_by_w, - guest_position_xy_mul_w, position_xy); - - // Check if the shader returns Z/W rather than Z, and if it does, revert that. - // TODO(Triang3l): Check if having XY or Z pre-divided by W should result in - // affine interpolation. - spv::Id position_z = - builder_->createCompositeExtract(guest_position, type_float_, 2); - spv::Id is_z_divided_by_w = builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant( - static_cast(kSysFlag_ZDividedByW))), - const_uint_0_); - spv::Id guest_position_z_mul_w = - builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w); - builder_->addDecoration(guest_position_z_mul_w, spv::DecorationNoContraction); - position_z = - builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w, - guest_position_z_mul_w, position_z); - - // Build XYZ of the position with W format handled. 
spv::Id position_xyz; + + // Open a scope since position_xy and position_z won't be synchronized anymore + // after position_xyz is built and modified later. { - std::unique_ptr composite_construct_op = - std::make_unique( - builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct); - composite_construct_op->addIdOperand(position_xy); - composite_construct_op->addIdOperand(position_z); - position_xyz = composite_construct_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(composite_construct_op)); + // Check if the shader returns XY/W rather than XY, and if it does, revert + // that. + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(2); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + spv::Id position_xy = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float2_, guest_position, uint_vector_temp_); + spv::Id is_xy_divided_by_w = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant( + static_cast(kSysFlag_XYDividedByW))), + const_uint_0_); + spv::Id guest_position_xy_mul_w = builder_->createBinOp( + spv::OpVectorTimesScalar, type_float2_, position_xy, position_w); + builder_->addDecoration(guest_position_xy_mul_w, + spv::DecorationNoContraction); + position_xy = builder_->createTriOp( + spv::OpSelect, type_float2_, + builder_->smearScalar(spv::NoPrecision, is_xy_divided_by_w, + type_bool2_), + guest_position_xy_mul_w, position_xy); + + // Check if the shader returns Z/W rather than Z, and if it does, revert + // that. 
+ spv::Id position_z = + builder_->createCompositeExtract(guest_position, type_float_, 2); + spv::Id is_z_divided_by_w = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant( + static_cast(kSysFlag_ZDividedByW))), + const_uint_0_); + spv::Id guest_position_z_mul_w = + builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w); + builder_->addDecoration(guest_position_z_mul_w, + spv::DecorationNoContraction); + position_z = + builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w, + guest_position_z_mul_w, position_z); + + // Build XYZ of the position with W format handled. + { + std::unique_ptr composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float3_, spv::OpCompositeConstruct); + composite_construct_op->addIdOperand(position_xy); + composite_construct_op->addIdOperand(position_z); + position_xyz = composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(composite_construct_op)); + } } // Apply the NDC scale and offset for guest to host viewport transformation. @@ -1382,20 +1510,6 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { ndc_offset_mul_w); builder_->addDecoration(position_xyz, spv::DecorationNoContraction); - // Store the position converted to the host. - spv::Id position; - { - std::unique_ptr composite_construct_op = - std::make_unique( - builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); - composite_construct_op->addIdOperand(position_xyz); - composite_construct_op->addIdOperand(position_w); - position = composite_construct_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(composite_construct_op)); - } - builder_->createStore(position, position_ptr); - // Write the point size. 
if (output_point_size_ != spv::NoResult) { spv::Id point_size; @@ -1415,6 +1529,154 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() { } builder_->createStore(point_size, output_point_size_); } + + Modification shader_modification = GetSpirvShaderModification(); + + // Expand the point sprite. + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Top-left, bottom-left, top-right, bottom-right order (chosen arbitrarily, + // simply based on counterclockwise meaning front with + // frontFace = VkFrontFace(0), but faceness is ignored for non-polygon + // primitive types). + id_vector_temp_.clear(); + id_vector_temp_.reserve(2); + id_vector_temp_.push_back(builder_->makeUintConstant(0b10)); + id_vector_temp_.push_back(builder_->makeUintConstant(0b01)); + spv::Id point_vertex_positive = builder_->createBinOp( + spv::OpINotEqual, type_bool2_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint2_, + builder_->smearScalar(spv::NoPrecision, + builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createLoad(input_vertex_index_, + spv::NoPrecision)), + type_uint2_), + builder_->createCompositeConstruct(type_uint2_, id_vector_temp_)), + SpirvSmearScalarResultOrConstant(const_uint_0_, type_uint2_)); + + // Load the point diameter in guest pixels, with the override from the + // vertex shader if provided. 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantPointConstantDiameter)); + spv::Id point_guest_diameter = builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision); + if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b001) { + assert_true(var_main_point_size_edge_flag_kill_vertex_ != spv::NoResult); + id_vector_temp_.clear(); + id_vector_temp_.push_back(const_int_0_); + spv::Id point_vertex_diameter = builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassFunction, + var_main_point_size_edge_flag_kill_vertex_, id_vector_temp_), + spv::NoPrecision); + // The vertex shader's header writes -1.0 to point_size by default, so any + // non-negative value means that it was overwritten by the translated + // vertex shader, and needs to be used instead of the constant size. The + // per-vertex diameter has already been clamped earlier in translation + // (combined with making it non-negative). + point_guest_diameter = builder_->createTriOp( + spv::OpSelect, type_float2_, + builder_->smearScalar( + spv::NoPrecision, + builder_->createBinOp(spv::OpFOrdGreaterThanEqual, type_bool_, + point_vertex_diameter, const_float_0_), + type_bool2_), + builder_->smearScalar(spv::NoPrecision, point_vertex_diameter, + type_float2_), + point_guest_diameter); + } + // Transform the diameter in the guest screen coordinates to radius in the + // normalized device coordinates. 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant( + kSystemConstantPointScreenDiameterToNdcRadius)); + spv::Id point_radius = builder_->createBinOp( + spv::OpFMul, type_float2_, point_guest_diameter, + builder_->createLoad(builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision)); + builder_->addDecoration(point_radius, spv::DecorationNoContraction); + // Transform the radius from the normalized device coordinates to the clip + // space. + point_radius = builder_->createBinOp(spv::OpVectorTimesScalar, type_float2_, + point_radius, position_w); + builder_->addDecoration(point_radius, spv::DecorationNoContraction); + + // Apply the direction of expansion for the current host vertex. + spv::Id point_radius_negative = + builder_->createUnaryOp(spv::OpFNegate, type_float2_, point_radius); + builder_->addDecoration(point_radius_negative, + spv::DecorationNoContraction); + // Expand the point sprite. + uint_vector_temp_.clear(); + uint_vector_temp_.reserve(2); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + spv::Id point_position_xy = builder_->createBinOp( + spv::OpFAdd, type_float2_, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + position_xyz, uint_vector_temp_), + builder_->createTriOp(spv::OpSelect, type_float2_, + point_vertex_positive, point_radius, + point_radius_negative)); + builder_->addDecoration(point_position_xy, spv::DecorationNoContraction); + // Store the position. + spv::Id position; + { + // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()` + // assertion in createCompositeConstruct, OpCompositeConstruct can + // construct vectors not only from scalars, but also from other vectors. 
+ std::unique_ptr composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); + composite_construct_op->addIdOperand(point_position_xy); + composite_construct_op->addIdOperand( + builder_->createCompositeExtract(position_xyz, type_float_, 2)); + composite_construct_op->addIdOperand(position_w); + position = composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(composite_construct_op)); + } + builder_->createStore(position, position_ptr); + + // Write the point coordinates. + if (output_point_coordinates_ != spv::NoResult) { + builder_->createStore( + builder_->createTriOp(spv::OpSelect, type_float2_, + point_vertex_positive, const_float2_1_, + const_float2_0_), + output_point_coordinates_); + } + + // TODO(Triang3l): For points, handle ps_ucp_mode (take the guest clip space + // coordinates instead of the host ones, calculate the distances to the user + // clip planes, cull using the distance from the center for modes 0, 1 and + // 2, cull and clip per-vertex for modes 2 and 3) in clip and cull + // distances. + } else { + // Store the position converted to the host. + spv::Id position; + { + // Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()` + // assertion in createCompositeConstruct, OpCompositeConstruct can + // construct vectors not only from scalars, but also from other vectors. 
+ std::unique_ptr composite_construct_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct); + composite_construct_op->addIdOperand(position_xyz); + composite_construct_op->addIdOperand(position_w); + position = composite_construct_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(composite_construct_op)); + } + builder_->createStore(position, position_ptr); + } } void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 733bbf2ff..3bcd342a3 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -50,7 +50,11 @@ class SpirvShaderTranslator : public ShaderTranslator { // Interpolators written by the vertex shader and needed by the pixel // shader. uint32_t interpolator_mask : xenos::kMaxInterpolators; - uint32_t output_point_size : 1; + // For HostVertexShaderType kPointListAsTriangleStrip, whether to output + // the point coordinates. + // For other HostVertexShaderTypes (though truly reachable only for + // kVertex), whether to output the point size. + uint32_t output_point_parameters : 1; // Dynamically indexable register count from SQ_PROGRAM_CNTL. uint32_t dynamic_addressable_register_count : 8; // Pipeline stage and input configuration. @@ -655,6 +659,9 @@ class SpirvShaderTranslator : public ShaderTranslator { // all). std::array input_output_interpolators_; + // VS, only for HostVertexShaderType::kPointListAsTriangleStrip when needed + // for the PS - float2. + spv::Id output_point_coordinates_; // VS, only when needed - float. 
spv::Id output_point_size_; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 80affe639..68a00cbe8 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2171,7 +2171,9 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // TODO(Triang3l): Tessellation, geometry-type-specific vertex shader, // vertex shader as compute. if (primitive_processing_result.host_vertex_shader_type != - Shader::HostVertexShaderType::kVertex) { + Shader::HostVertexShaderType::kVertex && + primitive_processing_result.host_vertex_shader_type != + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { return false; } @@ -2179,7 +2181,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, vertex_shader_modification = pipeline_cache_->GetCurrentVertexShaderModification( *vertex_shader, primitive_processing_result.host_vertex_shader_type, - interpolator_mask); + interpolator_mask, ps_param_gen_pos != UINT32_MAX); pixel_shader_modification = pixel_shader ? pipeline_cache_->GetCurrentPixelShaderModification( *pixel_shader, interpolator_mask, ps_param_gen_pos) @@ -2348,6 +2350,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, } const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); + const VkPhysicalDeviceFeatures& device_features = provider.device_features(); const VkPhysicalDeviceLimits& device_limits = provider.device_properties().limits; @@ -2382,11 +2385,23 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, UpdateDynamicState(viewport_info, primitive_polygonal, normalized_depth_control); + auto vgt_draw_initiator = regs.Get(); + + // Whether to load the guest 32-bit (usually big-endian) vertex index + // indirectly in the vertex shader if full 32-bit indices are not supported by + // the host. 
+ bool shader_32bit_index_dma = + !device_features.fullDrawIndexUint32 && + primitive_processing_result.index_buffer_type == + PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA && + vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32 && + primitive_processing_result.host_vertex_shader_type == + Shader::HostVertexShaderType::kVertex; + // Update system constants before uploading them. - bool vertex_shader_index_load; UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result, - viewport_info, used_texture_mask, - vertex_shader_index_load); + shader_32bit_index_dma, viewport_info, + used_texture_mask); // Update uniform buffers and descriptor sets after binding the pipeline with // the new layout. @@ -2453,13 +2468,13 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Draw. if (primitive_processing_result.index_buffer_type == PrimitiveProcessor::ProcessedIndexBufferType::kNone || - vertex_shader_index_load) { + shader_32bit_index_dma) { deferred_command_buffer_.CmdVkDraw( primitive_processing_result.host_draw_vertex_count, 1, 0, 0); } else { std::pair index_buffer; switch (primitive_processing_result.index_buffer_type) { - case PrimitiveProcessor::ProcessedIndexBufferType::kGuest: + case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA: index_buffer.first = shared_memory_->buffer(); index_buffer.second = primitive_processing_result.guest_index_base; break; @@ -2467,7 +2482,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, index_buffer = primitive_processor_->GetConvertedIndexBuffer( primitive_processing_result.host_index_buffer_handle); break; - case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto: + case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA: index_buffer = primitive_processor_->GetBuiltinIndexBuffer( primitive_processing_result.host_index_buffer_handle); break; @@ 
-3342,8 +3358,8 @@ void VulkanCommandProcessor::UpdateDynamicState( void VulkanCommandProcessor::UpdateSystemConstantValues( bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, - const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, - bool& vertex_shader_index_load_out) { + bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, + uint32_t used_texture_mask) { #if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES @@ -3367,51 +3383,17 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( // Flags. uint32_t flags = 0; // Vertex index shader loading. - bool vertex_shader_index_load = false; - // Only for ProcessedIndexBufferType kGuest since kHostConverted indices may - // be not loaded into the GPU memory (only read on the CPU), though - // kHostConverted must never be used for point lists and rectangle lists - // without geometry shaders anyway. For regular 32-bit index fetching without - // fullDrawIndexUint32, kHostConverted indices are already byte-swapped and - // truncated to 24 bits, so indirect fetch is not needed. + if (shader_32bit_index_dma) { + flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad; + } if (primitive_processing_result.index_buffer_type == - PrimitiveProcessor::ProcessedIndexBufferType::kGuest) { - switch (primitive_processing_result.host_vertex_shader_type) { - case Shader::HostVertexShaderType::kVertex: { - // For guest (usually big-endian) 32-bit indices when they're not - // supported by the device. 
- if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { - const ui::vulkan::VulkanProvider& provider = GetVulkanProvider(); - const VkPhysicalDeviceFeatures& device_features = - provider.device_features(); - if (!device_features.fullDrawIndexUint32) { - vertex_shader_index_load = true; - flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad; - } - } - } break; - // kMemexportCompute never comes out of the PrimitiveProcessor, as - // memexport compute shaders are executed alongside their vertex - // counterparts, since they may still result in drawing. - case Shader::HostVertexShaderType::kPointListAsTriangleStrip: - case Shader::HostVertexShaderType::kRectangleListAsTriangleStrip: { - // Always loading the guest index buffer indirectly if it's used, as - // host indexing contains a part needed specifically for the host for - // the construction of the primitive - host vertices don't map 1:1 to - // guest ones. - vertex_shader_index_load = true; - flags |= - SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad; - if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { - flags |= SpirvShaderTranslator :: - kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit; - } - } break; - default: - break; + PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA) { + flags |= SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad; + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) { + flags |= SpirvShaderTranslator :: + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit; } } - vertex_shader_index_load_out = vertex_shader_index_load; // W0 division control. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf // 8: VTX_XY_FMT = true: the incoming XY have already been multiplied by 1/W0. @@ -3466,9 +3448,9 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( // Index or tessellation edge factor buffer endianness. 
dirty |= system_constants_.vertex_index_endian != - primitive_processing_result.host_index_endian; + primitive_processing_result.host_shader_index_endian; system_constants_.vertex_index_endian = - primitive_processing_result.host_index_endian; + primitive_processing_result.host_shader_index_endian; // Vertex index offset. dirty |= system_constants_.vertex_base_index != vgt_indx_offset; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index f500e0718..7920981fb 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -436,8 +436,8 @@ class VulkanCommandProcessor : public CommandProcessor { void UpdateSystemConstantValues( bool primitive_polygonal, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, - const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, - bool& vertex_shader_index_load_out); + bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info, + uint32_t used_texture_mask); bool UpdateBindings(const VulkanShader* vertex_shader, const VulkanShader* pixel_shader); // Allocates a descriptor set and fills one or two VkWriteDescriptorSet diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index 7cf30e250..aff800c1a 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -118,7 +118,7 @@ VulkanShader* VulkanPipelineCache::LoadShader(xenos::ShaderType shader_type, SpirvShaderTranslator::Modification VulkanPipelineCache::GetCurrentVertexShaderModification( const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type, - uint32_t interpolator_mask) const { + uint32_t interpolator_mask, bool ps_param_gen_used) const { assert_true(shader.type() == xenos::ShaderType::kVertex); assert_true(shader.is_ucode_analyzed()); const auto& regs = register_file_; @@ -133,10 +133,15 
@@ VulkanPipelineCache::GetCurrentVertexShaderModification( modification.vertex.interpolator_mask = interpolator_mask; - modification.vertex.output_point_size = - uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) && - regs.Get().prim_type == - xenos::PrimitiveType::kPointList); + if (host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + modification.vertex.output_point_parameters = uint32_t(ps_param_gen_used); + } else { + modification.vertex.output_point_parameters = + uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) && + regs.Get().prim_type == + xenos::PrimitiveType::kPointList); + } return modification; } @@ -828,6 +833,17 @@ bool VulkanPipelineCache::GetGeometryShaderKey( if (geometry_shader_type == PipelineGeometryShader::kNone) { return false; } + // For kPointListAsTriangleStrip, output_point_parameters has a different + // meaning (the coordinates, not the size). However, the AsTriangleStrip host + // vertex shader types are needed specifically when geometry shaders are not + // supported as fallbacks. 
+ if (vertex_shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip || + vertex_shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) { + assert_always(); + return false; + } GeometryShaderKey key; key.type = geometry_shader_type; // TODO(Triang3l): Once all needed inputs and outputs are added, uncomment the @@ -840,7 +856,8 @@ bool VulkanPipelineCache::GetGeometryShaderKey( /* vertex_shader_modification.vertex.user_clip_plane_cull */ 0; key.has_vertex_kill_and = /* vertex_shader_modification.vertex.vertex_kill_and */ 0; - key.has_point_size = vertex_shader_modification.vertex.output_point_size; + key.has_point_size = + vertex_shader_modification.vertex.output_point_parameters; key.has_point_coordinates = pixel_shader_modification.pixel.param_gen_point; key_out = key; return true; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h index 6e0c73ab0..56346d1bc 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.h @@ -71,7 +71,7 @@ class VulkanPipelineCache { SpirvShaderTranslator::Modification GetCurrentVertexShaderModification( const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type, - uint32_t interpolator_mask) const; + uint32_t interpolator_mask, bool ps_param_gen_used) const; SpirvShaderTranslator::Modification GetCurrentPixelShaderModification( const Shader& shader, uint32_t interpolator_mask, uint32_t param_gen_pos) const; diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc index b7f37f4b9..86b13b4ae 100644 --- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.cc @@ -36,7 +36,9 @@ bool VulkanPrimitiveProcessor::Initialize() { if (!InitializeCommon(device_features.fullDrawIndexUint32, 
!device_portability_subset_features || device_portability_subset_features->triangleFans, - false, device_features.geometryShader)) { + false, device_features.geometryShader, + device_features.geometryShader, + device_features.geometryShader)) { Shutdown(); return false; } @@ -127,9 +129,9 @@ void VulkanPrimitiveProcessor::EndFrame() { frame_index_buffers_.clear(); } -bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, std::function fill_callback) { - assert_not_zero(index_count); +bool VulkanPrimitiveProcessor::InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) { + assert_not_zero(size_bytes); assert_true(builtin_index_buffer_ == VK_NULL_HANDLE); assert_true(builtin_index_buffer_memory_ == VK_NULL_HANDLE); assert_true(builtin_index_buffer_upload_ == VK_NULL_HANDLE); @@ -140,7 +142,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn(); VkDevice device = provider.device(); - builtin_index_buffer_size_ = VkDeviceSize(sizeof(uint16_t) * index_count); + builtin_index_buffer_size_ = VkDeviceSize(size_bytes); if (!ui::vulkan::util::CreateDedicatedAllocationBuffer( provider, builtin_index_buffer_size_, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, @@ -148,8 +150,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( builtin_index_buffer_memory_)) { XELOGE( "Vulkan primitive processor: Failed to create the built-in index " - "buffer GPU resource with {} 16-bit indices", - index_count); + "buffer GPU resource with {} bytes", + size_bytes); return false; } uint32_t upload_memory_type; @@ -161,8 +163,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( &upload_memory_type)) { XELOGE( "Vulkan primitive processor: Failed to create the built-in index " - "buffer upload resource with {} 16-bit indices", - index_count); + "buffer upload resource with {} bytes", + 
size_bytes); ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device, builtin_index_buffer_); ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device, @@ -175,8 +177,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( VK_WHOLE_SIZE, 0, &mapping) != VK_SUCCESS) { XELOGE( "Vulkan primitive processor: Failed to map the built-in index buffer " - "upload resource with {} 16-bit indices", - index_count); + "upload resource with {} bytes", + size_bytes); ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device, builtin_index_buffer_upload_); ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device, @@ -187,7 +189,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer( builtin_index_buffer_memory_); return false; } - fill_callback(reinterpret_cast(mapping)); + fill_callback(mapping); ui::vulkan::util::FlushMappedMemoryRange( provider, builtin_index_buffer_memory_, upload_memory_type); dfn.vkUnmapMemory(device, builtin_index_buffer_upload_memory_); diff --git a/src/xenia/gpu/vulkan/vulkan_primitive_processor.h b/src/xenia/gpu/vulkan/vulkan_primitive_processor.h index 50e729577..ea8ed4fed 100644 --- a/src/xenia/gpu/vulkan/vulkan_primitive_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_primitive_processor.h @@ -56,9 +56,8 @@ class VulkanPrimitiveProcessor final : public PrimitiveProcessor { } protected: - bool InitializeBuiltin16BitIndexBuffer( - uint32_t index_count, - std::function fill_callback) override; + bool InitializeBuiltinIndexBuffer( + size_t size_bytes, std::function fill_callback) override; void* RequestHostConvertedIndexBufferForCurrentFrame( xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,