[Vulkan] Non-GS point sprites + minor SPIR-V fixes

This commit is contained in:
Triang3l 2022-07-27 17:14:28 +03:00
parent ff7ef05063
commit 7595cdb52b
14 changed files with 721 additions and 274 deletions

View File

@ -2268,7 +2268,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
UpdateSystemConstantValues(
memexport_used, primitive_polygonal,
primitive_processing_result.line_loop_closing_index,
primitive_processing_result.host_index_endian, viewport_info,
primitive_processing_result.host_shader_index_endian, viewport_info,
used_texture_mask, normalized_depth_control, normalized_color_mask);
// Update constant buffers, descriptors and root parameters.
@ -2513,7 +2513,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
}
ID3D12Resource* scratch_index_buffer = nullptr;
switch (primitive_processing_result.index_buffer_type) {
case PrimitiveProcessor::ProcessedIndexBufferType::kGuest: {
case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA: {
if (memexport_used) {
// If the shared memory is a UAV, it can't be used as an index buffer
// (UAV is a read/write state, index buffer is a read-only state).
@ -2545,7 +2545,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
primitive_processor_->GetConvertedIndexBufferGpuAddress(
primitive_processing_result.host_index_buffer_handle);
break;
case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin:
case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto:
case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA:
index_buffer_view.BufferLocation =
primitive_processor_->GetBuiltinIndexBufferGpuAddress(
primitive_processing_result.host_index_buffer_handle);

View File

@ -28,7 +28,7 @@ namespace d3d12 {
D3D12PrimitiveProcessor::~D3D12PrimitiveProcessor() { Shutdown(true); }
bool D3D12PrimitiveProcessor::Initialize() {
if (!InitializeCommon(true, false, false, true)) {
if (!InitializeCommon(true, false, false, true, true, true)) {
Shutdown();
return false;
}
@ -83,9 +83,9 @@ void D3D12PrimitiveProcessor::EndFrame() {
frame_index_buffers_.clear();
}
bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
uint32_t index_count, std::function<void(uint16_t*)> fill_callback) {
assert_not_zero(index_count);
bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer(
size_t size_bytes, std::function<void(void*)> fill_callback) {
assert_not_zero(size_bytes);
assert_null(builtin_index_buffer_);
assert_null(builtin_index_buffer_upload_);
@ -94,8 +94,7 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
ID3D12Device* device = provider.GetDevice();
D3D12_RESOURCE_DESC resource_desc;
ui::d3d12::util::FillBufferResourceDesc(
resource_desc, UINT64(sizeof(uint16_t) * index_count),
ui::d3d12::util::FillBufferResourceDesc(resource_desc, UINT64(size_bytes),
D3D12_RESOURCE_FLAG_NONE);
Microsoft::WRL::ComPtr<ID3D12Resource> draw_resource;
if (FAILED(device->CreateCommittedResource(
@ -105,8 +104,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
IID_PPV_ARGS(&draw_resource)))) {
XELOGE(
"D3D12 primitive processor: Failed to create the built-in index "
"buffer GPU resource with {} 16-bit indices",
index_count);
"buffer GPU resource with {} bytes",
size_bytes);
return false;
}
Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
@ -117,8 +116,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
IID_PPV_ARGS(&upload_resource)))) {
XELOGE(
"D3D12 primitive processor: Failed to create the built-in index "
"buffer upload resource with {} 16-bit indices",
index_count);
"buffer upload resource with {} bytes",
size_bytes);
return false;
}
@ -127,8 +126,8 @@ bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
if (FAILED(upload_resource->Map(0, &upload_read_range, &mapping))) {
XELOGE(
"D3D12 primitive processor: Failed to map the built-in index buffer "
"upload resource with {} 16-bit indices",
index_count);
"upload resource with {} bytes",
size_bytes);
return false;
}
fill_callback(reinterpret_cast<uint16_t*>(mapping));

View File

@ -56,9 +56,8 @@ class D3D12PrimitiveProcessor final : public PrimitiveProcessor {
}
protected:
bool InitializeBuiltin16BitIndexBuffer(
uint32_t index_count,
std::function<void(uint16_t*)> fill_callback) override;
bool InitializeBuiltinIndexBuffer(
size_t size_bytes, std::function<void(void*)> fill_callback) override;
void* RequestHostConvertedIndexBufferForCurrentFrame(
xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,

View File

@ -964,8 +964,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() {
// Check if the shader returns XY/W rather than XY, and if it does, revert
// that.
// TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
// affine interpolation.
a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_XYDividedByW));
a_.OpIf(true, temp_x_src);
a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0011),
@ -974,8 +972,6 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() {
a_.OpEndIf();
// Check if the shader returns Z/W rather than Z, and if it does, revert that.
// TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
// affine interpolation.
a_.OpAnd(temp_x_dest, flags_src, dxbc::Src::LU(kSysFlag_ZDividedByW));
a_.OpIf(true, temp_x_src);
a_.OpMul(dxbc::Dest::R(system_temp_position_, 0b0100),

View File

@ -9,6 +9,7 @@
#include "xenia/gpu/primitive_processor.h"
#include <algorithm>
#include <cstring>
#include <functional>
#include <utility>
@ -106,7 +107,9 @@ PrimitiveProcessor::~PrimitiveProcessor() { ShutdownCommon(); }
bool PrimitiveProcessor::InitializeCommon(
bool full_32bit_vertex_indices_supported, bool triangle_fans_supported,
bool line_loops_supported, bool quad_lists_supported) {
bool line_loops_supported, bool quad_lists_supported,
bool point_sprites_supported_without_vs_expansion,
bool rectangle_lists_supported_without_vs_expansion) {
full_32bit_vertex_indices_used_ = full_32bit_vertex_indices_supported;
convert_triangle_fans_to_lists_ =
!triangle_fans_supported || cvars::force_convert_triangle_fans_to_lists;
@ -115,32 +118,93 @@ bool PrimitiveProcessor::InitializeCommon(
convert_quad_lists_to_triangle_lists_ =
!quad_lists_supported ||
cvars::force_convert_quad_lists_to_triangle_lists;
// No override cvars as hosts are not required to support the fallback paths
// since they require different vertex shader structure (for the fallback
// HostVertexShaderTypes).
expand_point_sprites_in_vs_ = !point_sprites_supported_without_vs_expansion;
expand_rectangle_lists_in_vs_ =
!rectangle_lists_supported_without_vs_expansion;
// Initialize the index buffer for conversion of auto-indexed primitive types.
uint32_t builtin_index_count = 0;
size_t builtin_index_buffer_size = 0;
// 32-bit, before 16-bit due to alignment (for primitive expansion - when the
// indices encode not only the guest vertex index, but also a part needed for
// host expansion, thus may contain values above UINT16_MAX, such as up to
// (UINT16_MAX - 1) * 4 + 3 for point sprites).
// Using an index buffer for point sprite and rectangle list expansion instead
// of instancing, because how instancing is implemented may vary wildly
// between GPUs and may be slow (for example, different instances never sharing
// a wavefront) with small vertex counts per instance. Also using triangle
// strips with primitive restart, not triangle lists, so the vertex shader may
// be invoked only once for the inner edge vertices, which is important for
// memory export in guest shaders, so as not to write to the same location from
// two invocations.
uint32_t builtin_ib_two_triangle_strip_count = 0;
if (expand_point_sprites_in_vs_) {
builtin_ib_two_triangle_strip_count =
std::max(uint32_t(UINT16_MAX), builtin_ib_two_triangle_strip_count);
}
if (expand_rectangle_lists_in_vs_) {
builtin_ib_two_triangle_strip_count =
std::max(uint32_t(UINT16_MAX / 3), builtin_ib_two_triangle_strip_count);
}
if (builtin_ib_two_triangle_strip_count) {
builtin_ib_offset_two_triangle_strips_ = builtin_index_buffer_size;
builtin_index_buffer_size +=
sizeof(uint32_t) *
GetTwoTriangleStripIndexCount(builtin_ib_two_triangle_strip_count);
} else {
builtin_ib_offset_two_triangle_strips_ = SIZE_MAX;
}
// 16-bit (for indirection on top of single auto-indexed vertices) - enough
// even if the backend has primitive reset enabled all the time (Metal) as
// auto-indexed draws are limited to UINT16_MAX vertices, not UINT16_MAX + 1.
if (convert_triangle_fans_to_lists_) {
builtin_ib_offset_triangle_fans_to_lists_ =
sizeof(uint16_t) * builtin_index_count;
builtin_index_count += GetTriangleFanListIndexCount(UINT16_MAX);
builtin_ib_offset_triangle_fans_to_lists_ = builtin_index_buffer_size;
builtin_index_buffer_size +=
sizeof(uint16_t) * GetTriangleFanListIndexCount(UINT16_MAX);
} else {
builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
}
if (convert_quad_lists_to_triangle_lists_) {
builtin_ib_offset_quad_lists_to_triangle_lists_ =
sizeof(uint16_t) * builtin_index_count;
builtin_index_count += GetQuadListTriangleListIndexCount(UINT16_MAX);
builtin_ib_offset_quad_lists_to_triangle_lists_ = builtin_index_buffer_size;
builtin_index_buffer_size +=
sizeof(uint16_t) * GetQuadListTriangleListIndexCount(UINT16_MAX);
} else {
builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
}
if (builtin_index_count) {
if (!InitializeBuiltin16BitIndexBuffer(
builtin_index_count, [this](uint16_t* mapping) {
if (builtin_index_buffer_size) {
if (!InitializeBuiltinIndexBuffer(
builtin_index_buffer_size,
[this, builtin_ib_two_triangle_strip_count](void* mapping) {
uint32_t* mapping_32bit = reinterpret_cast<uint32_t*>(mapping);
if (builtin_ib_offset_two_triangle_strips_ != SIZE_MAX) {
// Two-triangle strips.
uint32_t* two_triangle_strip_ptr =
mapping_32bit +
builtin_ib_offset_two_triangle_strips_ / sizeof(uint32_t);
for (uint32_t i = 0; i < builtin_ib_two_triangle_strip_count;
++i) {
if (i) {
// Primitive restart.
*(two_triangle_strip_ptr++) = UINT32_MAX;
}
// Host vertex index within the pair in the lower 2 bits,
// guest primitive index in the rest.
uint32_t two_triangle_strip_first_index = i << 2;
for (uint32_t j = 0; j < 4; ++j) {
*(two_triangle_strip_ptr++) =
two_triangle_strip_first_index + j;
}
}
}
uint16_t* mapping_16bit = reinterpret_cast<uint16_t*>(mapping);
if (builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX) {
// Triangle fans as triangle lists.
// Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D.
// https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
uint16_t* triangle_list_ptr =
mapping + builtin_ib_offset_triangle_fans_to_lists_ /
mapping_16bit + builtin_ib_offset_triangle_fans_to_lists_ /
sizeof(uint16_t);
for (uint32_t i = 2; i < UINT16_MAX; ++i) {
*(triangle_list_ptr++) = uint16_t(i - 1);
@ -150,7 +214,8 @@ bool PrimitiveProcessor::InitializeCommon(
}
if (builtin_ib_offset_quad_lists_to_triangle_lists_ != SIZE_MAX) {
uint16_t* triangle_list_ptr =
mapping + builtin_ib_offset_quad_lists_to_triangle_lists_ /
mapping_16bit +
builtin_ib_offset_quad_lists_to_triangle_lists_ /
sizeof(uint16_t);
// TODO(Triang3l): SIMD for faster initialization?
for (uint32_t i = 0; i < UINT16_MAX / 4; ++i) {
@ -309,15 +374,27 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
return false;
}
} else {
host_vertex_shader_type = Shader::HostVertexShaderType::kVertex;
switch (guest_primitive_type) {
case xenos::PrimitiveType::kPointList:
if (expand_point_sprites_in_vs_) {
host_primitive_type = xenos::PrimitiveType::kTriangleStrip;
host_vertex_shader_type =
Shader::HostVertexShaderType::kPointListAsTriangleStrip;
}
break;
case xenos::PrimitiveType::kLineList:
case xenos::PrimitiveType::kLineStrip:
case xenos::PrimitiveType::kTriangleList:
case xenos::PrimitiveType::kTriangleStrip:
// Supported natively on all backends.
break;
case xenos::PrimitiveType::kRectangleList:
// Supported natively or through geometry or compute shaders on all
// backends.
if (expand_rectangle_lists_in_vs_) {
host_primitive_type = xenos::PrimitiveType::kTriangleStrip;
host_vertex_shader_type =
Shader::HostVertexShaderType::kRectangleListAsTriangleStrip;
}
break;
case xenos::PrimitiveType::kTriangleFan:
if (convert_triangle_fans_to_lists_) {
@ -342,7 +419,6 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
assert_always();
return false;
}
host_vertex_shader_type = Shader::HostVertexShaderType::kVertex;
}
// Process the indices.
@ -359,12 +435,86 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
guest_draw_vertex_count = vgt_dma_size.num_words;
}
uint32_t line_loop_closing_index = 0;
uint32_t guest_index_base;
uint32_t guest_index_base = 0, guest_index_buffer_needed_bytes = 0;
CachedResult cacheable;
cacheable.host_draw_vertex_count = guest_draw_vertex_count;
cacheable.host_primitive_reset_enabled = false;
cacheable.host_index_buffer_handle = SIZE_MAX;
if (host_vertex_shader_type ==
Shader::HostVertexShaderType::kPointListAsTriangleStrip ||
host_vertex_shader_type ==
Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
// As two-triangle strips, with guest indices being either autogenerated or
// fetched via DMA.
uint32_t primitive_count = guest_draw_vertex_count;
if (host_vertex_shader_type ==
Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
primitive_count /= 3;
}
cacheable.host_draw_vertex_count =
GetTwoTriangleStripIndexCount(primitive_count);
cacheable.host_index_format = xenos::IndexFormat::kInt32;
cacheable.host_primitive_reset_enabled = true;
assert_true(builtin_ib_offset_two_triangle_strips_ != SIZE_MAX);
cacheable.host_index_buffer_handle = builtin_ib_offset_two_triangle_strips_;
if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) {
cacheable.index_buffer_type =
ProcessedIndexBufferType::kHostBuiltinForAuto;
cacheable.host_shader_index_endian = xenos::Endian::kNone;
} else {
// There is an index buffer.
assert_true(vgt_draw_initiator.source_select ==
xenos::SourceSelect::kDMA);
if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA) {
// TODO(Triang3l): Support immediate-indexed vertices.
XELOGE(
"Primitive processor: Unsupported vertex index source {}. Report "
"the game to Xenia developers!",
uint32_t(vgt_draw_initiator.source_select));
return false;
}
xenos::IndexFormat guest_index_format = vgt_draw_initiator.index_size;
// Normalize the endian.
cacheable.index_buffer_type =
ProcessedIndexBufferType::kHostBuiltinForDMA;
xenos::Endian guest_index_endian = vgt_dma_size.swap_mode;
if (guest_index_format == xenos::IndexFormat::kInt16 &&
(guest_index_endian != xenos::Endian::kNone &&
guest_index_endian != xenos::Endian::k8in16)) {
XELOGW(
"Primitive processor: 32-bit endian swap mode {} is used for "
"16-bit indices. This shouldn't normally be happening, but report "
"the game to Xenia developers for investigation of the intended "
"behavior (ignore or actually swap across adjacent indices)! "
"Currently disabling the swap for 16-and-32 and replacing 8-in-32 "
"with 8-in-16.",
uint32_t(guest_index_endian));
guest_index_endian = guest_index_endian == xenos::Endian::k8in32
? xenos::Endian::k8in16
: xenos::Endian::kNone;
}
cacheable.host_shader_index_endian = guest_index_endian;
// Get the index buffer memory range.
uint32_t index_size_log2 =
guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2;
// The base should already be aligned, but aligning here too for safety.
guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 &
~uint32_t((1 << index_size_log2) - 1);
guest_index_buffer_needed_bytes = guest_draw_vertex_count
<< index_size_log2;
if (guest_index_base > SharedMemory::kBufferSize ||
SharedMemory::kBufferSize - guest_index_base <
guest_index_buffer_needed_bytes) {
XELOGE(
"Primitive processor: Index buffer at 0x{:08X}, 0x{:X} bytes "
"required, is out of the physical memory bounds",
guest_index_base, guest_index_buffer_needed_bytes);
assert_always();
return false;
}
}
} else if (vgt_draw_initiator.source_select ==
xenos::SourceSelect::kAutoIndex) {
// Auto-indexed - use a remapping index buffer if needed to change the
// primitive type.
if (tessellation_enabled &&
@ -376,9 +526,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
assert_always();
return false;
}
guest_index_base = 0;
cacheable.host_index_format = xenos::IndexFormat::kInt16;
cacheable.host_index_endian = xenos::Endian::kNone;
cacheable.host_shader_index_endian = xenos::Endian::kNone;
cacheable.host_primitive_reset_enabled = false;
cacheable.index_buffer_type = ProcessedIndexBufferType::kNone;
if (host_primitive_type != guest_primitive_type) {
@ -388,7 +537,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
xenos::PrimitiveType::kTriangleList);
cacheable.host_draw_vertex_count =
GetTriangleFanListIndexCount(cacheable.host_draw_vertex_count);
cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin;
cacheable.index_buffer_type =
ProcessedIndexBufferType::kHostBuiltinForAuto;
assert_true(builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX);
cacheable.host_index_buffer_handle =
builtin_ib_offset_triangle_fans_to_lists_;
@ -409,7 +559,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
xenos::PrimitiveType::kTriangleList);
cacheable.host_draw_vertex_count = GetQuadListTriangleListIndexCount(
cacheable.host_draw_vertex_count);
cacheable.index_buffer_type = ProcessedIndexBufferType::kHostBuiltin;
cacheable.index_buffer_type =
ProcessedIndexBufferType::kHostBuiltinForAuto;
assert_true(builtin_ib_offset_quad_lists_to_triangle_lists_ !=
SIZE_MAX);
cacheable.host_index_buffer_handle =
@ -503,7 +654,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
// The base should already be aligned, but aligning here too for safety.
guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 &
~uint32_t((1 << index_size_log2) - 1);
uint32_t guest_index_buffer_needed_bytes = guest_draw_vertex_count
guest_index_buffer_needed_bytes = guest_draw_vertex_count
<< index_size_log2;
if (guest_index_base > SharedMemory::kBufferSize ||
SharedMemory::kBufferSize - guest_index_base <
@ -517,7 +668,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
}
cacheable.host_index_format = guest_index_format;
cacheable.host_index_endian = guest_index_endian;
cacheable.host_shader_index_endian = guest_index_endian;
uint32_t guest_index_mask_guest_endian =
guest_index_format == xenos::IndexFormat::kInt16
? UINT16_MAX
@ -666,7 +817,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
assert_unhandled_case(guest_index_endian);
return false;
}
cacheable.host_index_endian = xenos::Endian::kNone;
cacheable.host_shader_index_endian = xenos::Endian::kNone;
}
}
cache_transaction.SetNewResult(cacheable);
@ -677,7 +828,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
// endian-swap, or even to safely drop the upper 8 bits if no swap is even
// needed) indirectly.
cacheable.host_draw_vertex_count = guest_draw_vertex_count;
cacheable.index_buffer_type = ProcessedIndexBufferType::kGuest;
cacheable.index_buffer_type = ProcessedIndexBufferType::kGuestDMA;
cacheable.host_primitive_reset_enabled = guest_primitive_reset_enabled;
if (guest_primitive_reset_enabled) {
if (guest_index_format == xenos::IndexFormat::kInt16) {
@ -742,8 +893,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
} else {
// Low 24 bits of the guest index are compared to the primitive reset
// index. If the backend doesn't support full 32-bit indices, for
// ProcessedIndexBufferType::kGuest, the host needs to read the buffer
// indirectly in the vertex shaders and swap, and for
// ProcessedIndexBufferType::kGuestDMA, the host needs to read the
// buffer indirectly in the vertex shaders and swap, and for
// ProcessedIndexBufferType::kHostConverted (if primitive reset is
// actually used, thus exactly 0xFFFFFFFF must be sent to the host for
// it in a true index buffer), no indirection is done, but
@ -800,28 +951,33 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
assert_unhandled_case(guest_index_endian);
return false;
}
cacheable.host_index_endian = full_32bit_vertex_indices_used_
? guest_index_endian
cacheable.host_shader_index_endian =
full_32bit_vertex_indices_used_ ? guest_index_endian
: xenos::Endian::kNone;
}
cache_transaction.SetNewResult(cacheable);
}
}
}
if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuest) {
}
}
// Request the indices in the shared memory if they need to be accessed from
// there on the GPU.
if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuestDMA ||
cacheable.index_buffer_type ==
ProcessedIndexBufferType::kHostBuiltinForDMA) {
// Request the index buffer memory.
// TODO(Triang3l): Shared memory request cache.
if (!shared_memory_.RequestRange(guest_index_base,
guest_index_buffer_needed_bytes)) {
XELOGE(
"PrimitiveProcessor: Failed to request index buffer 0x{:08X}, "
"0x{:X} bytes needed, in the shared memory",
"PrimitiveProcessor: Failed to request index buffer 0x{:08X}, 0x{:X} "
"bytes needed, in the shared memory",
guest_index_base, guest_index_buffer_needed_bytes);
return false;
}
}
}
}
result_out.guest_primitive_type = guest_primitive_type;
result_out.host_primitive_type = host_primitive_type;
@ -832,7 +988,7 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
result_out.index_buffer_type = cacheable.index_buffer_type;
result_out.guest_index_base = guest_index_base;
result_out.host_index_format = cacheable.host_index_format;
result_out.host_index_endian = cacheable.host_index_endian;
result_out.host_shader_index_endian = cacheable.host_shader_index_endian;
result_out.host_primitive_reset_enabled =
cacheable.host_primitive_reset_enabled;
result_out.host_index_buffer_handle = cacheable.host_index_buffer_handle;

View File

@ -10,6 +10,7 @@
#ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#define XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#include <algorithm>
#include <climits>
#include <cstddef>
#include <cstdint>
@ -110,13 +111,16 @@ class PrimitiveProcessor {
// For 32-bit, indirection is needed if the host only supports 24-bit
// indices (even for non-endian-swapped, as the GPU should be ignoring the
// upper 8 bits completely, rather than exhibiting undefined behavior).
kGuest,
kGuestDMA,
// Converted and stored in the primitive converter for the current draw
// command. For 32-bit indices, if the host doesn't support all 32 bits,
// this kind of an index buffer will always be pre-masked and pre-swapped.
kHostConverted,
// Auto-indexed on the guest, but with an adapter index buffer on the host.
kHostBuiltin,
kHostBuiltinForAuto,
// Adapter index buffer on the host for indirect loading of indices via DMA
// (from the shared memory).
kHostBuiltinForDMA,
};
struct ProcessingResult {
@ -136,13 +140,14 @@ class PrimitiveProcessor {
ProcessedIndexBufferType index_buffer_type;
uint32_t guest_index_base;
xenos::IndexFormat host_index_format;
xenos::Endian host_index_endian;
xenos::Endian host_shader_index_endian;
// The reset index, if enabled, is always 0xFFFF for host_index_format
// kInt16 and 0xFFFFFFFF for kInt32. Never enabled for "list" primitive
// types, thus safe for direct usage on Vulkan.
bool host_primitive_reset_enabled;
// Backend-specific handle for the index buffer valid for the current draw,
// only valid for index_buffer_type kHostConverted and kHostBuiltin.
// only valid for index_buffer_type kHostConverted, kHostBuiltinForAuto and
// kHostBuiltinForDMA.
size_t host_index_buffer_handle;
bool IsTessellated() const {
return Shader::IsHostVertexShaderTypeDomain(host_vertex_shader_type);
@ -165,6 +170,12 @@ class PrimitiveProcessor {
bool IsConvertingQuadListsToTriangleLists() const {
return convert_quad_lists_to_triangle_lists_;
}
// Whether guest point lists are drawn as two-triangle strips expanded in the
// host vertex shader (HostVertexShaderType::kPointListAsTriangleStrip). Set
// once in InitializeCommon when the backend reports that point sprites are not
// supported without vertex shader expansion.
bool IsExpandingPointSpritesInVS() const {
return expand_point_sprites_in_vs_;
}
// Whether guest rectangle lists are drawn as two-triangle strips expanded in
// the host vertex shader
// (HostVertexShaderType::kRectangleListAsTriangleStrip). Set once in
// InitializeCommon when the backend reports that rectangle lists are not
// supported without vertex shader expansion.
bool IsExpandingRectangleListsInVS() const {
return expand_rectangle_lists_in_vs_;
}
// Submission must be open to call (may request the index buffer in the shared
// memory).
@ -217,8 +228,8 @@ class PrimitiveProcessor {
// if indirection may be needed.
// - When full 32-bit indices are not supported, the host must be using
// auto-indexed draws for 32-bit indices of ProcessedIndexBufferType
// kGuest, while fetching the index data manually from the shared memory
// buffer and endian-swapping it.
// kGuestDMA, while fetching the index data manually from the shared
// memory buffer and endian-swapping it.
// - Indirection, however, precludes primitive reset usage - so if
// primitive reset is needed, the primitive processor will pre-swap and
// pre-mask the index buffer so there are only host-endian 0x00###### or
@ -235,19 +246,26 @@ class PrimitiveProcessor {
// those guest primitive types directly or through geometry shader
// emulation. Debug overriding will be resolved in the common code if
// needed.
// - point_sprites_supported_without_vs_expansion,
// rectangle_lists_supported_without_vs_expansion:
// - Pass true or false depending on whether the host actually supports
// those guest primitive types directly or through geometry shader
// emulation. Overrides do not apply to these as hosts are not required to
// support the fallback paths since they require different vertex shader
// structure (for the fallback HostVertexShaderTypes).
bool InitializeCommon(bool full_32bit_vertex_indices_supported,
bool triangle_fans_supported, bool line_loops_supported,
bool quad_lists_supported);
bool quad_lists_supported,
bool point_sprites_supported_without_vs_expansion,
bool rectangle_lists_supported_without_vs_expansion);
// If any primitive type conversion is needed for auto-indexed draws, called
// from InitializeCommon (thus only once in the primitive processor's
// lifetime) to set up the backend's index buffer containing indices for
// primitive type remapping. The backend must allocate a `sizeof(uint16_t) *
// index_count` buffer and call fill_callback for its mapping if creation is
// successful. 16-bit indices are enough even if the backend has primitive
// reset enabled all the time (Metal) as auto-indexed draws are limited to
// UINT16_MAX vertices, not UINT16_MAX + 1.
virtual bool InitializeBuiltin16BitIndexBuffer(
uint32_t index_count, std::function<void(uint16_t*)> fill_callback) = 0;
// primitive type remapping. The backend must allocate a 4-byte-aligned buffer
// with `size_bytes` and call fill_callback for its mapping if creation has
// been successful.
virtual bool InitializeBuiltinIndexBuffer(
size_t size_bytes, std::function<void(void*)> fill_callback) = 0;
// Call last in implementation-specific shutdown, also callable from the
// destructor.
void ShutdownCommon();
@ -509,6 +527,12 @@ class PrimitiveProcessor {
}
};
static constexpr uint32_t GetTwoTriangleStripIndexCount(
    uint32_t strip_count) {
  // Number of indices needed to draw `strip_count` two-triangle strips from a
  // single index buffer: every strip uses 4 vertex indices, and each pair of
  // adjacent strips is separated by one primitive restart index. No strips
  // means no indices (and no restarts).
  return strip_count ? strip_count * 4 + (strip_count - 1) : 0;
}
// Triangle fan test cases:
// - 4D5307E6 - main menu - game logo, developer logo, backgrounds of the menu
// item list (the whole menu and individual items) - no index buffer.
@ -675,8 +699,11 @@ class PrimitiveProcessor {
bool convert_triangle_fans_to_lists_ = false;
bool convert_line_loops_to_strips_ = false;
bool convert_quad_lists_to_triangle_lists_ = false;
bool expand_point_sprites_in_vs_ = false;
bool expand_rectangle_lists_in_vs_ = false;
// Byte offsets used, for simplicity, directly as handles.
size_t builtin_ib_offset_two_triangle_strips_ = SIZE_MAX;
size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
@ -745,7 +772,7 @@ class PrimitiveProcessor {
uint32_t host_draw_vertex_count;
ProcessedIndexBufferType index_buffer_type;
xenos::IndexFormat host_index_format;
xenos::Endian host_index_endian;
xenos::Endian host_shader_index_endian;
bool host_primitive_reset_enabled;
size_t host_index_buffer_handle;
};

View File

@ -111,6 +111,7 @@ void SpirvShaderTranslator::Reset() {
input_front_facing_ = spv::NoResult;
std::fill(input_output_interpolators_.begin(),
input_output_interpolators_.end(), spv::NoResult);
output_point_coordinates_ = spv::NoResult;
output_point_size_ = spv::NoResult;
sampler_bindings_.clear();
@ -1097,10 +1098,24 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() {
Modification shader_modification = GetSpirvShaderModification();
// Create the point size output. Not using gl_PointSize from gl_PerVertex not
// to rely on the shaderTessellationAndGeometryPointSize feature, and also
// because the value written to gl_PointSize must be greater than zero.
if (shader_modification.vertex.output_point_size) {
if (shader_modification.vertex.output_point_parameters) {
if (shader_modification.vertex.host_vertex_shader_type ==
Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
// Create the point coordinates output.
output_point_coordinates_ =
builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput,
type_float2_, "xe_out_point_coordinates");
builder_->addDecoration(output_point_coordinates_,
spv::DecorationLocation, int(output_location));
builder_->addDecoration(output_point_coordinates_,
spv::DecorationInvariant);
main_interface_.push_back(output_point_coordinates_);
++output_location;
} else {
// Create the point size output. Not using gl_PointSize from gl_PerVertex
// not to rely on the shaderTessellationAndGeometryPointSize feature, and
// also because the value written to gl_PointSize must be greater than
// zero.
output_point_size_ =
builder_->createVariable(spv::NoPrecision, spv::StorageClassOutput,
type_float_, "xe_out_point_size");
@ -1110,6 +1125,7 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() {
main_interface_.push_back(output_point_size_);
++output_location;
}
}
// Create the gl_PerVertex output for used system outputs.
std::vector<spv::Id> struct_per_vertex_members;
@ -1172,16 +1188,121 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
}
}
Modification shader_modification = GetSpirvShaderModification();
// TODO(Triang3l): For HostVertexShaderType::kRectangleListAsTriangleStrip,
// start the vertex loop, and load the index there.
// Load the vertex index or the tessellation parameters.
if (register_count()) {
// TODO(Triang3l): Barycentric coordinates and patch index.
if (IsSpirvVertexShader()) {
// TODO(Triang3l): Close line loop primitive.
// Load the unswapped index as uint for swapping, or for indirect loading
// if needed.
spv::Id vertex_index = builder_->createUnaryOp(
spv::OpBitcast, type_uint_,
builder_->createLoad(input_vertex_index_, spv::NoPrecision));
if (shader_modification.vertex.host_vertex_shader_type ==
Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
// Load the point index, autogenerated or indirectly from the index
// buffer.
// Extract the primitive index from the two-triangle strip vertex index.
spv::Id const_uint_2 = builder_->makeUintConstant(2);
vertex_index = builder_->createBinOp(
spv::OpShiftRightLogical, type_uint_, vertex_index, const_uint_2);
// Check if the index needs to be loaded from the index buffer.
spv::Id load_vertex_index = builder_->createBinOp(
spv::OpINotEqual, type_bool_,
builder_->createBinOp(
spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
builder_->makeUintConstant(static_cast<unsigned int>(
kSysFlag_ComputeOrPrimitiveVertexIndexLoad))),
const_uint_0_);
spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint();
spv::Block& block_load_vertex_index_start = builder_->makeNewBlock();
spv::Block& block_load_vertex_index_merge = builder_->makeNewBlock();
SpirvCreateSelectionMerge(block_load_vertex_index_merge.getId(),
spv::SelectionControlDontFlattenMask);
builder_->createConditionalBranch(load_vertex_index,
&block_load_vertex_index_start,
&block_load_vertex_index_merge);
builder_->setBuildPoint(&block_load_vertex_index_start);
// Check if the index is 32-bit.
spv::Id vertex_index_is_32bit = builder_->createBinOp(
spv::OpINotEqual, type_bool_,
builder_->createBinOp(
spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_,
builder_->makeUintConstant(static_cast<unsigned int>(
kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit))),
const_uint_0_);
// Calculate the vertex index address in the shared memory.
id_vector_temp_.clear();
id_vector_temp_.push_back(
builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress));
spv::Id vertex_index_address = builder_->createBinOp(
spv::OpIAdd, type_uint_,
builder_->createLoad(
builder_->createAccessChain(spv::StorageClassUniform,
uniform_system_constants_,
id_vector_temp_),
spv::NoPrecision),
builder_->createBinOp(
spv::OpShiftLeftLogical, type_uint_, vertex_index,
builder_->createTriOp(spv::OpSelect, type_uint_,
vertex_index_is_32bit, const_uint_2,
builder_->makeUintConstant(1))));
// Load the 32 bits containing the whole vertex index or two 16-bit
// vertex indices.
// TODO(Triang3l): Bounds checking.
spv::Id loaded_vertex_index =
LoadUint32FromSharedMemory(builder_->createUnaryOp(
spv::OpBitcast, type_int_,
builder_->createBinOp(spv::OpShiftRightLogical, type_uint_,
vertex_index_address, const_uint_2)));
// Extract the 16-bit index from the loaded 32 bits if needed.
loaded_vertex_index = builder_->createTriOp(
spv::OpSelect, type_uint_, vertex_index_is_32bit,
loaded_vertex_index,
builder_->createTriOp(
spv::OpBitFieldUExtract, type_uint_, loaded_vertex_index,
builder_->createBinOp(
spv::OpShiftLeftLogical, type_uint_,
builder_->createBinOp(spv::OpBitwiseAnd, type_uint_,
vertex_index_address, const_uint_2),
builder_->makeUintConstant(4 - 1)),
builder_->makeUintConstant(16)));
// Endian-swap the loaded index.
id_vector_temp_.clear();
id_vector_temp_.push_back(
builder_->makeIntConstant(kSystemConstantVertexIndexEndian));
loaded_vertex_index = EndianSwap32Uint(
loaded_vertex_index,
builder_->createLoad(
builder_->createAccessChain(spv::StorageClassUniform,
uniform_system_constants_,
id_vector_temp_),
spv::NoPrecision));
// Get the actual build point for phi.
spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint();
builder_->createBranch(&block_load_vertex_index_merge);
// Select between the loaded index and the original index from Vulkan.
builder_->setBuildPoint(&block_load_vertex_index_merge);
{
std::unique_ptr<spv::Instruction> loaded_vertex_index_phi_op =
std::make_unique<spv::Instruction>(builder_->getUniqueId(),
type_uint_, spv::OpPhi);
loaded_vertex_index_phi_op->addIdOperand(loaded_vertex_index);
loaded_vertex_index_phi_op->addIdOperand(
block_load_vertex_index_end.getId());
loaded_vertex_index_phi_op->addIdOperand(vertex_index);
loaded_vertex_index_phi_op->addIdOperand(
block_load_vertex_index_pre.getId());
vertex_index = loaded_vertex_index_phi_op->getResultId();
builder_->getBuildPoint()->addInstruction(
std::move(loaded_vertex_index_phi_op));
}
} else {
// TODO(Triang3l): Close line loop primitive.
// Load the unswapped index as uint for swapping, or for indirect
// loading if needed.
if (!features_.full_draw_index_uint32) {
// Check if the full 32-bit index needs to be loaded indirectly.
spv::Id load_vertex_index = builder_->createBinOp(
@ -1239,18 +1360,20 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() {
std::move(loaded_vertex_index_phi_op));
}
}
// Endian-swap the index and convert to int.
// Endian-swap the index.
id_vector_temp_.clear();
id_vector_temp_.push_back(
builder_->makeIntConstant(kSystemConstantVertexIndexEndian));
spv::Id vertex_index_endian =
builder_->createLoad(builder_->createAccessChain(
vertex_index = EndianSwap32Uint(
vertex_index, builder_->createLoad(
builder_->createAccessChain(
spv::StorageClassUniform,
uniform_system_constants_, id_vector_temp_),
spv::NoPrecision);
vertex_index = builder_->createUnaryOp(
spv::OpBitcast, type_int_,
EndianSwap32Uint(vertex_index, vertex_index_endian));
spv::NoPrecision));
}
// Convert the index to a signed integer.
vertex_index =
builder_->createUnaryOp(spv::OpBitcast, type_int_, vertex_index);
// Add the base to the index.
id_vector_temp_.clear();
id_vector_temp_.push_back(
@ -1301,10 +1424,13 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
builder_->createTriOp(spv::OpSelect, type_float_, is_w_not_reciprocal,
position_w, guest_position_w_inv);
spv::Id position_xyz;
// Open a scope since position_xy and position_z won't be synchronized anymore
// after position_xyz is built and modified later.
{
// Check if the shader returns XY/W rather than XY, and if it does, revert
// that.
// TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
// affine interpolation.
uint_vector_temp_.clear();
uint_vector_temp_.reserve(2);
uint_vector_temp_.push_back(0);
@ -1322,13 +1448,14 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
spv::OpVectorTimesScalar, type_float2_, position_xy, position_w);
builder_->addDecoration(guest_position_xy_mul_w,
spv::DecorationNoContraction);
position_xy =
builder_->createTriOp(spv::OpSelect, type_float2_, is_xy_divided_by_w,
position_xy = builder_->createTriOp(
spv::OpSelect, type_float2_,
builder_->smearScalar(spv::NoPrecision, is_xy_divided_by_w,
type_bool2_),
guest_position_xy_mul_w, position_xy);
// Check if the shader returns Z/W rather than Z, and if it does, revert that.
// TODO(Triang3l): Check if having XY or Z pre-divided by W should result in
// affine interpolation.
// Check if the shader returns Z/W rather than Z, and if it does, revert
// that.
spv::Id position_z =
builder_->createCompositeExtract(guest_position, type_float_, 2);
spv::Id is_z_divided_by_w = builder_->createBinOp(
@ -1340,13 +1467,13 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
const_uint_0_);
spv::Id guest_position_z_mul_w =
builder_->createBinOp(spv::OpFMul, type_float_, position_z, position_w);
builder_->addDecoration(guest_position_z_mul_w, spv::DecorationNoContraction);
builder_->addDecoration(guest_position_z_mul_w,
spv::DecorationNoContraction);
position_z =
builder_->createTriOp(spv::OpSelect, type_float_, is_z_divided_by_w,
guest_position_z_mul_w, position_z);
// Build XYZ of the position with W format handled.
spv::Id position_xyz;
{
std::unique_ptr<spv::Instruction> composite_construct_op =
std::make_unique<spv::Instruction>(
@ -1357,6 +1484,7 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
builder_->getBuildPoint()->addInstruction(
std::move(composite_construct_op));
}
}
// Apply the NDC scale and offset for guest to host viewport transformation.
id_vector_temp_.clear();
@ -1382,20 +1510,6 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
ndc_offset_mul_w);
builder_->addDecoration(position_xyz, spv::DecorationNoContraction);
// Store the position converted to the host.
spv::Id position;
{
std::unique_ptr<spv::Instruction> composite_construct_op =
std::make_unique<spv::Instruction>(
builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
composite_construct_op->addIdOperand(position_xyz);
composite_construct_op->addIdOperand(position_w);
position = composite_construct_op->getResultId();
builder_->getBuildPoint()->addInstruction(
std::move(composite_construct_op));
}
builder_->createStore(position, position_ptr);
// Write the point size.
if (output_point_size_ != spv::NoResult) {
spv::Id point_size;
@ -1415,6 +1529,154 @@ void SpirvShaderTranslator::CompleteVertexOrTessEvalShaderInMain() {
}
builder_->createStore(point_size, output_point_size_);
}
Modification shader_modification = GetSpirvShaderModification();
// Expand the point sprite.
if (shader_modification.vertex.host_vertex_shader_type ==
Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
// Top-left, bottom-left, top-right, bottom-right order (chosen arbitrarily,
// simply based on counterclockwise meaning front with
// frontFace = VkFrontFace(0), but faceness is ignored for non-polygon
// primitive types).
id_vector_temp_.clear();
id_vector_temp_.reserve(2);
id_vector_temp_.push_back(builder_->makeUintConstant(0b10));
id_vector_temp_.push_back(builder_->makeUintConstant(0b01));
spv::Id point_vertex_positive = builder_->createBinOp(
spv::OpINotEqual, type_bool2_,
builder_->createBinOp(
spv::OpBitwiseAnd, type_uint2_,
builder_->smearScalar(spv::NoPrecision,
builder_->createUnaryOp(
spv::OpBitcast, type_uint_,
builder_->createLoad(input_vertex_index_,
spv::NoPrecision)),
type_uint2_),
builder_->createCompositeConstruct(type_uint2_, id_vector_temp_)),
SpirvSmearScalarResultOrConstant(const_uint_0_, type_uint2_));
// Load the point diameter in guest pixels, with the override from the
// vertex shader if provided.
id_vector_temp_.clear();
id_vector_temp_.push_back(
builder_->makeIntConstant(kSystemConstantPointConstantDiameter));
spv::Id point_guest_diameter = builder_->createLoad(
builder_->createAccessChain(spv::StorageClassUniform,
uniform_system_constants_, id_vector_temp_),
spv::NoPrecision);
if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b001) {
assert_true(var_main_point_size_edge_flag_kill_vertex_ != spv::NoResult);
id_vector_temp_.clear();
id_vector_temp_.push_back(const_int_0_);
spv::Id point_vertex_diameter = builder_->createLoad(
builder_->createAccessChain(
spv::StorageClassFunction,
var_main_point_size_edge_flag_kill_vertex_, id_vector_temp_),
spv::NoPrecision);
// The vertex shader's header writes -1.0 to point_size by default, so any
// non-negative value means that it was overwritten by the translated
// vertex shader, and needs to be used instead of the constant size. The
// per-vertex diameter has already been clamped earlier in translation
// (combined with making it non-negative).
point_guest_diameter = builder_->createTriOp(
spv::OpSelect, type_float2_,
builder_->smearScalar(
spv::NoPrecision,
builder_->createBinOp(spv::OpFOrdGreaterThanEqual, type_bool_,
point_vertex_diameter, const_float_0_),
type_bool2_),
builder_->smearScalar(spv::NoPrecision, point_vertex_diameter,
type_float2_),
point_guest_diameter);
}
// Transform the diameter in the guest screen coordinates to radius in the
// normalized device coordinates.
id_vector_temp_.clear();
id_vector_temp_.push_back(builder_->makeIntConstant(
kSystemConstantPointScreenDiameterToNdcRadius));
spv::Id point_radius = builder_->createBinOp(
spv::OpFMul, type_float2_, point_guest_diameter,
builder_->createLoad(builder_->createAccessChain(
spv::StorageClassUniform,
uniform_system_constants_, id_vector_temp_),
spv::NoPrecision));
builder_->addDecoration(point_radius, spv::DecorationNoContraction);
// Transform the radius from the normalized device coordinates to the clip
// space.
point_radius = builder_->createBinOp(spv::OpVectorTimesScalar, type_float2_,
point_radius, position_w);
builder_->addDecoration(point_radius, spv::DecorationNoContraction);
// Apply the direction of expansion for the current host vertex.
spv::Id point_radius_negative =
builder_->createUnaryOp(spv::OpFNegate, type_float2_, point_radius);
builder_->addDecoration(point_radius_negative,
spv::DecorationNoContraction);
// Expand the point sprite.
uint_vector_temp_.clear();
uint_vector_temp_.reserve(2);
uint_vector_temp_.push_back(0);
uint_vector_temp_.push_back(1);
spv::Id point_position_xy = builder_->createBinOp(
spv::OpFAdd, type_float2_,
builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_,
position_xyz, uint_vector_temp_),
builder_->createTriOp(spv::OpSelect, type_float2_,
point_vertex_positive, point_radius,
point_radius_negative));
builder_->addDecoration(point_position_xy, spv::DecorationNoContraction);
// Store the position.
spv::Id position;
{
// Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()`
// assertion in createCompositeConstruct, OpCompositeConstruct can
// construct vectors not only from scalars, but also from other vectors.
std::unique_ptr<spv::Instruction> composite_construct_op =
std::make_unique<spv::Instruction>(
builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
composite_construct_op->addIdOperand(point_position_xy);
composite_construct_op->addIdOperand(
builder_->createCompositeExtract(position_xyz, type_float_, 2));
composite_construct_op->addIdOperand(position_w);
position = composite_construct_op->getResultId();
builder_->getBuildPoint()->addInstruction(
std::move(composite_construct_op));
}
builder_->createStore(position, position_ptr);
// Write the point coordinates.
if (output_point_coordinates_ != spv::NoResult) {
builder_->createStore(
builder_->createTriOp(spv::OpSelect, type_float2_,
point_vertex_positive, const_float2_1_,
const_float2_0_),
output_point_coordinates_);
}
// TODO(Triang3l): For points, handle ps_ucp_mode (take the guest clip space
// coordinates instead of the host ones, calculate the distances to the user
// clip planes, cull using the distance from the center for modes 0, 1 and
// 2, cull and clip per-vertex for modes 2 and 3) in clip and cull
// distances.
} else {
// Store the position converted to the host.
spv::Id position;
{
// Bypass the `getNumTypeConstituents(typeId) == (int)constituents.size()`
// assertion in createCompositeConstruct, OpCompositeConstruct can
// construct vectors not only from scalars, but also from other vectors.
std::unique_ptr<spv::Instruction> composite_construct_op =
std::make_unique<spv::Instruction>(
builder_->getUniqueId(), type_float4_, spv::OpCompositeConstruct);
composite_construct_op->addIdOperand(position_xyz);
composite_construct_op->addIdOperand(position_w);
position = composite_construct_op->getResultId();
builder_->getBuildPoint()->addInstruction(
std::move(composite_construct_op));
}
builder_->createStore(position, position_ptr);
}
}
void SpirvShaderTranslator::StartFragmentShaderBeforeMain() {

View File

@ -50,7 +50,11 @@ class SpirvShaderTranslator : public ShaderTranslator {
// Interpolators written by the vertex shader and needed by the pixel
// shader.
uint32_t interpolator_mask : xenos::kMaxInterpolators;
uint32_t output_point_size : 1;
// For HostVertexShaderType kPointListAsTriangleStrip, whether to output
// the point coordinates.
// For other HostVertexShaderTypes (though truly reachable only for
// kVertex), whether to output the point size.
uint32_t output_point_parameters : 1;
// Dynamically indexable register count from SQ_PROGRAM_CNTL.
uint32_t dynamic_addressable_register_count : 8;
// Pipeline stage and input configuration.
@ -655,6 +659,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
// all).
std::array<spv::Id, xenos::kMaxInterpolators> input_output_interpolators_;
// VS, only for HostVertexShaderType::kPointListAsTriangleStrip when needed
// for the PS - float2.
spv::Id output_point_coordinates_;
// VS, only when needed - float.
spv::Id output_point_size_;

View File

@ -2171,7 +2171,9 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
// TODO(Triang3l): Tessellation, geometry-type-specific vertex shader,
// vertex shader as compute.
if (primitive_processing_result.host_vertex_shader_type !=
Shader::HostVertexShaderType::kVertex) {
Shader::HostVertexShaderType::kVertex &&
primitive_processing_result.host_vertex_shader_type !=
Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
return false;
}
@ -2179,7 +2181,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
vertex_shader_modification =
pipeline_cache_->GetCurrentVertexShaderModification(
*vertex_shader, primitive_processing_result.host_vertex_shader_type,
interpolator_mask);
interpolator_mask, ps_param_gen_pos != UINT32_MAX);
pixel_shader_modification =
pixel_shader ? pipeline_cache_->GetCurrentPixelShaderModification(
*pixel_shader, interpolator_mask, ps_param_gen_pos)
@ -2348,6 +2350,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
}
const ui::vulkan::VulkanProvider& provider = GetVulkanProvider();
const VkPhysicalDeviceFeatures& device_features = provider.device_features();
const VkPhysicalDeviceLimits& device_limits =
provider.device_properties().limits;
@ -2382,11 +2385,23 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
UpdateDynamicState(viewport_info, primitive_polygonal,
normalized_depth_control);
auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
// Whether to load the guest 32-bit (usually big-endian) vertex index
// indirectly in the vertex shader if full 32-bit indices are not supported by
// the host.
bool shader_32bit_index_dma =
!device_features.fullDrawIndexUint32 &&
primitive_processing_result.index_buffer_type ==
PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA &&
vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32 &&
primitive_processing_result.host_vertex_shader_type ==
Shader::HostVertexShaderType::kVertex;
// Update system constants before uploading them.
bool vertex_shader_index_load;
UpdateSystemConstantValues(primitive_polygonal, primitive_processing_result,
viewport_info, used_texture_mask,
vertex_shader_index_load);
shader_32bit_index_dma, viewport_info,
used_texture_mask);
// Update uniform buffers and descriptor sets after binding the pipeline with
// the new layout.
@ -2453,13 +2468,13 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
// Draw.
if (primitive_processing_result.index_buffer_type ==
PrimitiveProcessor::ProcessedIndexBufferType::kNone ||
vertex_shader_index_load) {
shader_32bit_index_dma) {
deferred_command_buffer_.CmdVkDraw(
primitive_processing_result.host_draw_vertex_count, 1, 0, 0);
} else {
std::pair<VkBuffer, VkDeviceSize> index_buffer;
switch (primitive_processing_result.index_buffer_type) {
case PrimitiveProcessor::ProcessedIndexBufferType::kGuest:
case PrimitiveProcessor::ProcessedIndexBufferType::kGuestDMA:
index_buffer.first = shared_memory_->buffer();
index_buffer.second = primitive_processing_result.guest_index_base;
break;
@ -2467,7 +2482,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
index_buffer = primitive_processor_->GetConvertedIndexBuffer(
primitive_processing_result.host_index_buffer_handle);
break;
case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltin:
case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForAuto:
case PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA:
index_buffer = primitive_processor_->GetBuiltinIndexBuffer(
primitive_processing_result.host_index_buffer_handle);
break;
@ -3342,8 +3358,8 @@ void VulkanCommandProcessor::UpdateDynamicState(
void VulkanCommandProcessor::UpdateSystemConstantValues(
bool primitive_polygonal,
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
bool& vertex_shader_index_load_out) {
bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info,
uint32_t used_texture_mask) {
#if XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_VULKAN_FINE_GRAINED_DRAW_SCOPES
@ -3367,51 +3383,17 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
// Flags.
uint32_t flags = 0;
// Vertex index shader loading.
bool vertex_shader_index_load = false;
// Only for ProcessedIndexBufferType kGuest since kHostConverted indices may
// not be loaded into the GPU memory (only read on the CPU), though
// kHostConverted must never be used for point lists and rectangle lists
// without geometry shaders anyway. For regular 32-bit index fetching without
// fullDrawIndexUint32, kHostConverted indices are already byte-swapped and
// truncated to 24 bits, so indirect fetch is not needed.
if (primitive_processing_result.index_buffer_type ==
PrimitiveProcessor::ProcessedIndexBufferType::kGuest) {
switch (primitive_processing_result.host_vertex_shader_type) {
case Shader::HostVertexShaderType::kVertex: {
// For guest (usually big-endian) 32-bit indices when they're not
// supported by the device.
if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) {
const ui::vulkan::VulkanProvider& provider = GetVulkanProvider();
const VkPhysicalDeviceFeatures& device_features =
provider.device_features();
if (!device_features.fullDrawIndexUint32) {
vertex_shader_index_load = true;
if (shader_32bit_index_dma) {
flags |= SpirvShaderTranslator::kSysFlag_VertexIndexLoad;
}
}
} break;
// kMemexportCompute never comes out of the PrimitiveProcessor, as
// memexport compute shaders are executed alongside their vertex
// counterparts, since they may still result in drawing.
case Shader::HostVertexShaderType::kPointListAsTriangleStrip:
case Shader::HostVertexShaderType::kRectangleListAsTriangleStrip: {
// Always loading the guest index buffer indirectly if it's used, as
// host indexing contains a part needed specifically for the host for
// the construction of the primitive - host vertices don't map 1:1 to
// guest ones.
vertex_shader_index_load = true;
flags |=
SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad;
if (primitive_processing_result.index_buffer_type ==
PrimitiveProcessor::ProcessedIndexBufferType::kHostBuiltinForDMA) {
flags |= SpirvShaderTranslator::kSysFlag_ComputeOrPrimitiveVertexIndexLoad;
if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) {
flags |= SpirvShaderTranslator ::
kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit;
}
} break;
default:
break;
}
}
vertex_shader_index_load_out = vertex_shader_index_load;
// W0 division control.
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
// 8: VTX_XY_FMT = true: the incoming XY have already been multiplied by 1/W0.
@ -3466,9 +3448,9 @@ void VulkanCommandProcessor::UpdateSystemConstantValues(
// Index or tessellation edge factor buffer endianness.
dirty |= system_constants_.vertex_index_endian !=
primitive_processing_result.host_index_endian;
primitive_processing_result.host_shader_index_endian;
system_constants_.vertex_index_endian =
primitive_processing_result.host_index_endian;
primitive_processing_result.host_shader_index_endian;
// Vertex index offset.
dirty |= system_constants_.vertex_base_index != vgt_indx_offset;

View File

@ -436,8 +436,8 @@ class VulkanCommandProcessor : public CommandProcessor {
void UpdateSystemConstantValues(
bool primitive_polygonal,
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
bool& vertex_shader_index_load_out);
bool shader_32bit_index_dma, const draw_util::ViewportInfo& viewport_info,
uint32_t used_texture_mask);
bool UpdateBindings(const VulkanShader* vertex_shader,
const VulkanShader* pixel_shader);
// Allocates a descriptor set and fills one or two VkWriteDescriptorSet

View File

@ -118,7 +118,7 @@ VulkanShader* VulkanPipelineCache::LoadShader(xenos::ShaderType shader_type,
SpirvShaderTranslator::Modification
VulkanPipelineCache::GetCurrentVertexShaderModification(
const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type,
uint32_t interpolator_mask) const {
uint32_t interpolator_mask, bool ps_param_gen_used) const {
assert_true(shader.type() == xenos::ShaderType::kVertex);
assert_true(shader.is_ucode_analyzed());
const auto& regs = register_file_;
@ -133,10 +133,15 @@ VulkanPipelineCache::GetCurrentVertexShaderModification(
modification.vertex.interpolator_mask = interpolator_mask;
modification.vertex.output_point_size =
if (host_vertex_shader_type ==
Shader::HostVertexShaderType::kPointListAsTriangleStrip) {
modification.vertex.output_point_parameters = uint32_t(ps_param_gen_used);
} else {
modification.vertex.output_point_parameters =
uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) &&
regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
xenos::PrimitiveType::kPointList);
}
return modification;
}
@ -828,6 +833,17 @@ bool VulkanPipelineCache::GetGeometryShaderKey(
if (geometry_shader_type == PipelineGeometryShader::kNone) {
return false;
}
// For kPointListAsTriangleStrip, output_point_parameters has a different
// meaning (the coordinates, not the size). However, the AsTriangleStrip host
// vertex shader types are needed specifically as fallbacks when geometry
// shaders are not supported.
if (vertex_shader_modification.vertex.host_vertex_shader_type ==
Shader::HostVertexShaderType::kPointListAsTriangleStrip ||
vertex_shader_modification.vertex.host_vertex_shader_type ==
Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
assert_always();
return false;
}
GeometryShaderKey key;
key.type = geometry_shader_type;
// TODO(Triang3l): Once all needed inputs and outputs are added, uncomment the
@ -840,7 +856,8 @@ bool VulkanPipelineCache::GetGeometryShaderKey(
/* vertex_shader_modification.vertex.user_clip_plane_cull */ 0;
key.has_vertex_kill_and =
/* vertex_shader_modification.vertex.vertex_kill_and */ 0;
key.has_point_size = vertex_shader_modification.vertex.output_point_size;
key.has_point_size =
vertex_shader_modification.vertex.output_point_parameters;
key.has_point_coordinates = pixel_shader_modification.pixel.param_gen_point;
key_out = key;
return true;

View File

@ -71,7 +71,7 @@ class VulkanPipelineCache {
SpirvShaderTranslator::Modification GetCurrentVertexShaderModification(
const Shader& shader,
Shader::HostVertexShaderType host_vertex_shader_type,
uint32_t interpolator_mask) const;
uint32_t interpolator_mask, bool ps_param_gen_used) const;
SpirvShaderTranslator::Modification GetCurrentPixelShaderModification(
const Shader& shader, uint32_t interpolator_mask,
uint32_t param_gen_pos) const;

View File

@ -36,7 +36,9 @@ bool VulkanPrimitiveProcessor::Initialize() {
if (!InitializeCommon(device_features.fullDrawIndexUint32,
!device_portability_subset_features ||
device_portability_subset_features->triangleFans,
false, device_features.geometryShader)) {
false, device_features.geometryShader,
device_features.geometryShader,
device_features.geometryShader)) {
Shutdown();
return false;
}
@ -127,9 +129,9 @@ void VulkanPrimitiveProcessor::EndFrame() {
frame_index_buffers_.clear();
}
bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
uint32_t index_count, std::function<void(uint16_t*)> fill_callback) {
assert_not_zero(index_count);
bool VulkanPrimitiveProcessor::InitializeBuiltinIndexBuffer(
size_t size_bytes, std::function<void(void*)> fill_callback) {
assert_not_zero(size_bytes);
assert_true(builtin_index_buffer_ == VK_NULL_HANDLE);
assert_true(builtin_index_buffer_memory_ == VK_NULL_HANDLE);
assert_true(builtin_index_buffer_upload_ == VK_NULL_HANDLE);
@ -140,7 +142,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
VkDevice device = provider.device();
builtin_index_buffer_size_ = VkDeviceSize(sizeof(uint16_t) * index_count);
builtin_index_buffer_size_ = VkDeviceSize(size_bytes);
if (!ui::vulkan::util::CreateDedicatedAllocationBuffer(
provider, builtin_index_buffer_size_,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
@ -148,8 +150,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
builtin_index_buffer_memory_)) {
XELOGE(
"Vulkan primitive processor: Failed to create the built-in index "
"buffer GPU resource with {} 16-bit indices",
index_count);
"buffer GPU resource with {} bytes",
size_bytes);
return false;
}
uint32_t upload_memory_type;
@ -161,8 +163,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
&upload_memory_type)) {
XELOGE(
"Vulkan primitive processor: Failed to create the built-in index "
"buffer upload resource with {} 16-bit indices",
index_count);
"buffer upload resource with {} bytes",
size_bytes);
ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device,
builtin_index_buffer_);
ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device,
@ -175,8 +177,8 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
VK_WHOLE_SIZE, 0, &mapping) != VK_SUCCESS) {
XELOGE(
"Vulkan primitive processor: Failed to map the built-in index buffer "
"upload resource with {} 16-bit indices",
index_count);
"upload resource with {} bytes",
size_bytes);
ui::vulkan::util::DestroyAndNullHandle(dfn.vkDestroyBuffer, device,
builtin_index_buffer_upload_);
ui::vulkan::util::DestroyAndNullHandle(dfn.vkFreeMemory, device,
@ -187,7 +189,7 @@ bool VulkanPrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
builtin_index_buffer_memory_);
return false;
}
fill_callback(reinterpret_cast<uint16_t*>(mapping));
fill_callback(mapping);
ui::vulkan::util::FlushMappedMemoryRange(
provider, builtin_index_buffer_memory_, upload_memory_type);
dfn.vkUnmapMemory(device, builtin_index_buffer_upload_memory_);

View File

@ -56,9 +56,8 @@ class VulkanPrimitiveProcessor final : public PrimitiveProcessor {
}
protected:
bool InitializeBuiltin16BitIndexBuffer(
uint32_t index_count,
std::function<void(uint16_t*)> fill_callback) override;
bool InitializeBuiltinIndexBuffer(
size_t size_bytes, std::function<void(void*)> fill_callback) override;
void* RequestHostConvertedIndexBufferForCurrentFrame(
xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,