// xenia-canary/src/xenia/gpu/primitive_processor.h

/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#define XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#include <climits>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <functional>
#include <mutex>
#include <unordered_map>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/shared_memory.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
#if XE_ARCH_AMD64
// 128-bit SSSE3-level SIMD (SSE2 for integer comparison, SSSE3 for pshufb).
// 256-bit AVX isn't used because AVX only got integer operations such as
// comparison in AVX2, which is above the minimum requirements of Xenia.
#include <tmmintrin.h>
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
#elif XE_ARCH_ARM64
#include <arm_neon.h>
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
#else
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 0
#endif // XE_ARCH
// The idea behind this config variable is to force both indirection without
// primitive reset and pre-masking / pre-swapping with primitive reset,
// therefore it's supposed to be checked only by backends that support
// indirection. It's pretty pointless to do only half of this on backends that
// support full 32-bit indices unconditionally.
DECLARE_bool(ignore_32bit_vertex_index_support);
namespace xe {
namespace gpu {
// Normalizes primitive data in various ways for use with Direct3D 12 and Vulkan
// (down to its minimum requirements plus the portability subset).
//
// This solves various issues:
// - Triangle fans not supported on Direct3D 10+ and the Vulkan portability
// subset.
// - Converts to triangle lists, both with and without primitive reset.
// - Line loops are not supported on Direct3D 12 or Vulkan.
// - Converts to line strips.
// - Quads not reproducible with line lists with adjacency without geometry
// shaders (some Vulkan implementations), as well as being hard to debug in
// PIX due to "catastrophic failures".
// - Converts to triangle lists.
// - Vulkan requiring the 0xFFFF primitive restart index for 16-bit indices and
// 0xFFFFFFFF for 32-bit (Direct3D 12 slightly relaxes this, also allowing
// 0xFFFF for 32-bit, but that's of no use to Xenia since guest indices are
// usually big-endian). Also, only the lower 24 bits of the vertex index are
// used on the guest (tested on an Adreno 200 phone with drawing, though not
// with primitive restart as OpenGL ES 2.0 doesn't expose it), so the upper 8
// bits likely shouldn't have an effect on primitive restart (a guest reset
// index of 0xFFFFFF likely triggering a reset for 0xFFFFFF, 0xFFFFFFFF, and
// 254 more values), while Vulkan and Direct3D 12 require exactly 0xFFFFFFFF.
// - For 16-bit indices with guest reset index other than 0xFFFF (passing
// 0xFFFF directly to the host is fine because it's the same irrespective of
// endianness), there are two possible solutions:
// - If the index buffer doesn't contain 0xFFFF otherwise (since
// it's a valid vertex index in this case), replacing the primitive reset
// index with 0xFFFF in the 16-bit buffer.
// - If the index buffer contains any usage of 0xFFFF as a real vertex
// index, converting the index buffer to 32-bit, and replacing the
// primitive reset index with 0xFFFFFFFF.
// - For 32-bit indices, there are two paths:
// - If the guest reset index is 0xFFFFFF, and the index buffer actually
// uses only 0xFFFFFFFF for reset, using it without changes.
// - If the guest uses something other than 0xFFFFFFFF for primitive reset,
// replacing elements with (index & 0xFFFFFF) == reset_index with
// 0xFFFFFFFF.
// - Some Vulkan implementations only support 24-bit indices. The guests usually
// pass big-endian indices, so we need all 32 bits (as the least significant
// bits will be in 24...31) to perform the byte swapping. For this reason, we
// load 32-bit indices indirectly, doing non-indexed draws and fetching the
// indices from the shared memory. This, however, is not compatible with
// primitive restart.
// - Pre-swapping, masking to 24 bits, and converting the reset index to
// 0xFFFFFFFF, resulting in an index buffer that can be used directly.
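//
// As an illustration of the 16-bit reset handling described above (with
// hypothetical index values): a guest buffer {2, 7, 0xFFFE, 3, 4, 5} with the
// guest reset index 0xFFFE and no real 0xFFFF usage can simply have the reset
// element replaced, giving {2, 7, 0xFFFF, 3, 4, 5} to draw with the host
// 0xFFFF restart index; if 0xFFFF also appeared as a real vertex index, the
// buffer would instead be widened to 32-bit with the reset element becoming
// 0xFFFFFFFF.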
class PrimitiveProcessor {
public:
enum ProcessedIndexBufferType {
// Auto-indexed on the host.
kNone,
// GPU DMA, from the shared memory.
// For 32-bit, indirection is needed if the host only supports 24-bit
// indices (even for non-endian-swapped, as the GPU should be ignoring the
// upper 8 bits completely, rather than exhibiting undefined behavior).
kGuest,
// Converted and stored in the primitive converter for the current draw
// command. For 32-bit indices, if the host doesn't support all 32 bits,
// this kind of an index buffer will always be pre-masked and pre-swapped.
kHostConverted,
// Auto-indexed on the guest, but with an adapter index buffer on the host.
kHostBuiltin,
};
struct ProcessingResult {
xenos::PrimitiveType guest_primitive_type;
xenos::PrimitiveType host_primitive_type;
// Includes whether tessellation is enabled (not kVertex) and the type of
// tessellation.
Shader::HostVertexShaderType host_vertex_shader_type;
// Only used for non-kVertex host_vertex_shader_type. For kAdaptive, the
// index buffer is always from the guest and fully 32-bit, and contains the
// floating-point tessellation factors.
xenos::TessellationMode tessellation_mode;
// TODO(Triang3l): If important, split into the index count and the actual
// index buffer size, using zeros for out-of-bounds indices.
uint32_t host_draw_vertex_count;
uint32_t line_loop_closing_index;
ProcessedIndexBufferType index_buffer_type;
uint32_t guest_index_base;
xenos::IndexFormat host_index_format;
xenos::Endian host_index_endian;
// The reset index, if enabled, is always 0xFFFF for host_index_format
// kInt16 and 0xFFFFFFFF for kInt32.
bool host_primitive_reset_enabled;
// Backend-specific handle for the index buffer valid for the current draw,
// only valid for index_buffer_type kHostConverted and kHostBuiltin.
size_t host_index_buffer_handle;
bool IsTessellated() const {
return host_vertex_shader_type != Shader::HostVertexShaderType::kVertex;
}
};
virtual ~PrimitiveProcessor();
bool AreFull32BitVertexIndicesUsed() const {
return full_32bit_vertex_indices_used_;
}
bool IsConvertingTriangleFansToLists() const {
return convert_triangle_fans_to_lists_;
}
bool IsConvertingLineLoopsToStrips() const {
return convert_line_loops_to_strips_;
}
// Quad lists may be emulated as line lists with adjacency and a geometry
// shader, but geometry shaders must be supported for this.
bool IsConvertingQuadListsToTriangleLists() const {
return convert_quad_lists_to_triangle_lists_;
}
// Submission must be open when calling this (it may request the index buffer
// in the shared memory).
bool Process(ProcessingResult& result_out);
// Invalidates the cache within the range.
std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
uint32_t physical_address_start, uint32_t length, bool exact_range);
protected:
// For host-side index buffer creation, the biggest possibly needed contiguous
// allocation, in indices.
// - No conversion: up to 0xFFFF vertices (as the vertex count in
// VGT_DRAW_INITIATOR is 16-bit).
// - Triangle fans to lists: starting from the 3rd vertex, every guest vertex
// creates a triangle, thus the maximum is 3 * (UINT16_MAX - 2), or 0x2FFF7.
// Primitive reset can only slow down the amplification - the 3 vertices
// after a reset add 1 host vertex each, not 3 each.
// - Line loops to strips: adding 1 vertex if there are at least 2 vertices in
// the original primitive, either replacing the primitive reset index with
// this new closing vertex, or in case of the final primitive, just adding a
// vertex - thus the absolute limit is UINT16_MAX + 1, or 0x10000.
// - Quad lists to triangle lists: vertices are processed in groups of 4, each
// group converted to 6 vertices, so the limit is 1.5 * 0xFFFC, or 0x17FFA.
// Thus, the maximum vertex count is defined by triangle fan to list
// conversion.
// Also include padding for co-alignment of the source and the destination for
// SIMD.
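// With 16-byte SIMD vectors, this evaluates to 4 * 65533 * 3 + 16 = 786412
// bytes.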
static constexpr uint32_t kMinRequiredConvertedIndexBufferSize =
sizeof(uint32_t) * (UINT16_MAX - 2) * 3 +
XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE;
PrimitiveProcessor(const RegisterFile& register_file, Memory& memory,
TraceWriter& trace_writer, SharedMemory& shared_memory)
: register_file_(register_file),
memory_(memory),
trace_writer_(trace_writer),
shared_memory_(shared_memory) {}
// Call from the backend-specific initialization function.
// - full_32bit_vertex_indices_supported:
// - If the backend supports 32-bit indices unconditionally, and doesn't
// generate indirection logic in vertex shaders, pass hard-coded `true`.
// - Otherwise:
// - If the host doesn't support full 32-bit indices (but supports at
// least 24-bit indices), pass `false`.
// - If the host supports 32-bit indices, but the backend can handle both
// cases, pass `true` unless `cvars::ignore_32bit_vertex_index_support`
// is set (in which case pass `false`), and afterwards, check
// `AreFull32BitVertexIndicesUsed()` externally to see if indirection may
// be needed.
// - When full 32-bit indices are not supported, the host must be using
// auto-indexed draws for 32-bit indices of ProcessedIndexBufferType
// kGuest, while fetching the index data manually from the shared memory
// buffer and endian-swapping it.
// - Indirection, however, precludes primitive reset usage - so if
// primitive reset is needed, the primitive processor will pre-swap and
// pre-mask the index buffer so there are only host-endian 0x00###### or
// 0xFFFFFFFF values in it. In this case, a kHostConverted index buffer
// is returned from Process, and indirection is not needed (and
// impossible since the index buffer is not in the shared memory buffer
// anymore), though byte swap is still needed as 16-bit indices may also
// be kHostConverted, while they are completely unaffected by this. The
// same applies to primitive type conversion - if it happens for 32-bit
// guest indices, and kHostConverted is returned, they will be
// pre-swapped and pre-masked.
// - triangle_fans_supported, line_loops_supported, quad_lists_supported:
// - Pass true or false depending on whether the host actually supports
// those guest primitive types directly or through geometry shader
// emulation. Debug overriding will be resolved in the common code if
// needed.
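// For illustration, a hypothetical backend that supports only 24-bit vertex
// indices and none of the legacy guest primitive types directly might call
// (a sketch, not taken from an actual backend):
// InitializeCommon(/* full_32bit_vertex_indices_supported */ false,
// /* triangle_fans_supported */ false,
// /* line_loops_supported */ false,
// /* quad_lists_supported */ false);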
bool InitializeCommon(bool full_32bit_vertex_indices_supported,
bool triangle_fans_supported, bool line_loops_supported,
bool quad_lists_supported);
// If any primitive type conversion is needed for auto-indexed draws, called
// from InitializeCommon (thus only once in the primitive processor's
// lifetime) to set up the backend's index buffer containing indices for
// primitive type remapping. The backend must allocate a `sizeof(uint16_t) *
// index_count` buffer and call fill_callback for its mapping if creation is
// successful. 16-bit indices are enough even if the backend has primitive
// reset enabled all the time (Metal) as auto-indexed draws are limited to
// UINT16_MAX vertices, not UINT16_MAX + 1.
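// A minimal sketch of an override, assuming a hypothetical backend that keeps
// the built-in indices in a CPU-side std::vector<uint16_t> named
// builtin_index_buffer_ (real backends would create a GPU buffer and pass its
// mapping to fill_callback instead):
// bool InitializeBuiltin16BitIndexBuffer(
// uint32_t index_count,
// std::function<void(uint16_t*)> fill_callback) override {
// builtin_index_buffer_.resize(index_count);
// fill_callback(builtin_index_buffer_.data());
// return true;
// }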
virtual bool InitializeBuiltin16BitIndexBuffer(
uint32_t index_count, std::function<void(uint16_t*)> fill_callback) = 0;
// Call last in implementation-specific shutdown, also callable from the
// destructor.
void ShutdownCommon();
// Call at boundaries of lifespans of converted data (between frames,
// preferably at the end of a frame, between the swap and the next draw, so
// access violation handlers need to do less work).
void ClearPerFrameCache();
static constexpr size_t GetBuiltinIndexBufferOffsetBytes(size_t handle) {
// For simplicity, just using the handles as byte offsets.
return handle;
}
// The destination allocation must have XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
// excess bytes.
static ptrdiff_t GetSimdCoalignmentOffset(const void* host_index_ptr,
uint32_t guest_index_base) {
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
// Always moving the host pointer only forward into the allocation padding
// space of XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE bytes. Without relying on
// two's complement wrapping overflow behavior, the logic would look like:
// uintptr_t host_subalignment =
// reinterpret_cast<uintptr_t>(host_index_ptr) &
// (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
// uint32_t guest_subalignment = guest_index_base &
// (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
// if (guest_subalignment >= host_subalignment) {
// return guest_subalignment - host_subalignment;
// }
// return XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE -
// (host_subalignment - guest_subalignment);
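// For instance, with 16-byte vectors, if host_index_ptr & 15 == 8 and
// guest_index_base & 15 == 4, the returned offset is (4 - 8) & 15 == 12;
// advancing the destination pointer by 12 bytes makes the host destination
// congruent with the guest source modulo the vector size.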
return ptrdiff_t(
(guest_index_base - reinterpret_cast<uintptr_t>(host_index_ptr)) &
(XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
#else
return 0;
#endif
}
// Requests a buffer to write the new transformed indices to. The lifetime of
// the returned buffer must be that of the current frame. Returns the mapping
// of the buffer to write to, or nullptr in case of failure; on success, also
// returns, via backend_handle_out, a handle that the backend's command
// processor can use to access the backend-specific data for binding the
// buffer.
virtual void* RequestHostConvertedIndexBufferForCurrentFrame(
xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
uint32_t coalignment_original_address, size_t& backend_handle_out) = 0;
private:
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
#if XE_ARCH_AMD64
// SSSE3 or AVX.
using SimdVectorU16 = __m128i;
using SimdVectorU32 = __m128i;
static SimdVectorU16 ReplicateU16(uint16_t value) {
return _mm_set1_epi16(int16_t(value));
}
static SimdVectorU32 ReplicateU32(uint32_t value) {
return _mm_set1_epi32(int32_t(value));
}
static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
}
static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
}
static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
}
static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
}
#elif XE_ARCH_ARM64
// NEON.
using SimdVectorU16 = uint16x8_t;
using SimdVectorU32 = uint32x4_t;
static SimdVectorU16 ReplicateU16(uint16_t value) {
return vdupq_n_u16(value);
}
static SimdVectorU32 ReplicateU32(uint32_t value) {
return vdupq_n_u32(value);
}
static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
#if XE_COMPILER_MSVC
return vld1q_u16_ex(source, sizeof(uint16x8_t) * CHAR_BIT);
#else
return vld1q_u16(reinterpret_cast<const uint16_t*>(
__builtin_assume_aligned(source, sizeof(uint16x8_t))));
#endif
}
static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
#if XE_COMPILER_MSVC
return vld1q_u32_ex(source, sizeof(uint32x4_t) * CHAR_BIT);
#else
return vld1q_u32(reinterpret_cast<const uint32_t*>(
__builtin_assume_aligned(source, sizeof(uint32x4_t))));
#endif
}
static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
vst1q_u16(dest, source);
}
static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
vst1q_u32(dest, source);
}
#else
#error SIMD vector types and constant loads not specified.
#endif // XE_ARCH
static_assert(
sizeof(SimdVectorU16) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
"XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
"actually used");
static_assert(
sizeof(SimdVectorU32) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
"XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
"actually used");
static constexpr uint32_t kSimdVectorU16Elements =
sizeof(SimdVectorU16) / sizeof(uint16_t);
static constexpr uint32_t kSimdVectorU32Elements =
sizeof(SimdVectorU32) / sizeof(uint32_t);
#endif // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
static bool IsResetUsed(const uint16_t* source, uint32_t count,
uint16_t reset_index_guest_endian);
static void Get16BitResetIndexUsage(const uint16_t* source, uint32_t count,
uint16_t reset_index_guest_endian,
bool& is_reset_index_used_out,
bool& is_ffff_used_as_vertex_index_out);
static bool IsResetUsed(const uint32_t* source, uint32_t count,
uint32_t reset_index_guest_endian,
uint32_t low_bits_mask_guest_endian);
static void ReplaceResetIndex16To16(uint16_t* dest, const uint16_t* source,
uint32_t count,
uint16_t reset_index_guest_endian);
// For use when the reset index is not 0xFFFF, and 0xFFFF is also used as a
// valid index - keeps 0xFFFF as a real index and replaces the reset index
// with 0xFFFFFFFF instead.
static void ReplaceResetIndex16To24(uint32_t* dest, const uint16_t* source,
uint32_t count,
uint16_t reset_index_guest_endian);
// The reset index and the low 24 bits mask are taken explicitly because this
// function may be used two ways:
// - Passthrough - when the vertex shader swaps the indices (when 32-bit
// indices are supported on the host), in this case HostSwap is kNone, but
// the reset index and the guest low bits mask can be swapped according to
// the guest endian.
// - Swapping for the host - when only 24 bits of an index are supported on
// the host. In this case, masking and comparison are done before applying
// HostSwap, but according to HostSwap, if needed, the data is swapped from
// the PowerPC's big endianness to the host GPU little endianness that we
// assume, which matches the Xenos's little endianness.
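// For example (a sketch): with guest endian k8in32, the guest writes its
// 24-bit reset index 0xFFFFFF as big-endian bytes, which the host reads back
// as 0xFFFFFF00, so the caller would pass
// reset_index_guest_endian == GpuSwap(0xFFFFFF, xenos::Endian::k8in32) and
// low_bits_mask_guest_endian == GpuSwap(xenos::kVertexIndexMask,
// xenos::Endian::k8in32), with HostSwap == k8in32 when the host needs
// pre-swapped 24-bit indices, or HostSwap == kNone when the vertex shader
// swaps the indices itself.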
template <xenos::Endian HostSwap>
static void ReplaceResetIndex32To24(uint32_t* dest, const uint32_t* source,
uint32_t count,
uint32_t reset_index_guest_endian,
uint32_t low_bits_mask_guest_endian) {
// The Xbox 360's GPU only uses the low 24 bits of the index - masking.
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
while (count && (reinterpret_cast<uintptr_t>(source) &
(XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1))) {
--count;
uint32_t index = *(source++) & low_bits_mask_guest_endian;
*(dest++) = index != reset_index_guest_endian
? xenos::GpuSwap(index, HostSwap)
: UINT32_MAX;
}
if (count >= kSimdVectorU32Elements) {
SimdVectorU32 reset_index_guest_endian_simd =
ReplicateU32(reset_index_guest_endian);
SimdVectorU32 low_bits_mask_guest_endian_simd =
ReplicateU32(low_bits_mask_guest_endian);
#if XE_ARCH_AMD64
__m128i host_swap_shuffle;
if constexpr (HostSwap != xenos::Endian::kNone) {
host_swap_shuffle = _mm_set_epi32(
int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
}
#endif // XE_ARCH_AMD64
while (count >= kSimdVectorU32Elements) {
count -= kSimdVectorU32Elements;
// Comparison produces 0 or 0xFFFFFFFF per 32-bit lane on both SSE and
// NEON - we need 0xFFFFFFFF as the result for the primitive reset
// indices, so the result is
// `index | (index == reset_index)`.
SimdVectorU32 source_simd = LoadAlignedVectorU32(source);
source += kSimdVectorU32Elements;
SimdVectorU32 result_simd;
#if XE_ARCH_AMD64
source_simd =
_mm_and_si128(source_simd, low_bits_mask_guest_endian_simd);
result_simd = _mm_or_si128(
source_simd,
_mm_cmpeq_epi32(source_simd, reset_index_guest_endian_simd));
if constexpr (HostSwap != xenos::Endian::kNone) {
result_simd = _mm_shuffle_epi8(result_simd, host_swap_shuffle);
}
#elif XE_ARCH_ARM64
source_simd = vandq_u32(source_simd, low_bits_mask_guest_endian_simd);
result_simd = vorrq_u32(
source_simd, vceqq_u32(source_simd, reset_index_guest_endian_simd));
if constexpr (HostSwap == xenos::Endian::k8in16) {
result_simd = vreinterpretq_u32_u8(
vrev16q_u8(vreinterpretq_u8_u32(result_simd)));
} else if constexpr (HostSwap == xenos::Endian::k8in32) {
result_simd = vreinterpretq_u32_u8(
vrev32q_u8(vreinterpretq_u8_u32(result_simd)));
} else if constexpr (HostSwap == xenos::Endian::k16in32) {
result_simd = vreinterpretq_u32_u16(
vrev32q_u16(vreinterpretq_u16_u32(result_simd)));
}
#else
#error SIMD ReplaceResetIndex32To24 not implemented.
#endif // XE_ARCH
StoreUnalignedVectorU32(dest, result_simd);
dest += kSimdVectorU32Elements;
}
}
#endif // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
while (count--) {
uint32_t index = *(source++) & low_bits_mask_guest_endian;
*(dest++) = index != reset_index_guest_endian
? xenos::GpuSwap(index, HostSwap)
: UINT32_MAX;
}
}
// TODO(Triang3l): 16-bit -> 32-bit primitive type conversion for Metal, where
// primitive reset is always enabled, if UINT16_MAX is used as a real vertex
// index.
struct PassthroughIndexTransform {
uint16_t operator()(uint16_t index) const { return index; }
uint32_t operator()(uint32_t index) const { return index; }
};
struct To24NonSwappingIndexTransform {
uint32_t operator()(uint32_t index) const {
return index & xenos::kVertexIndexMask;
}
};
struct To24Swapping8In16IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k8in16) &
xenos::kVertexIndexMask;
}
};
struct To24Swapping8In32IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k8in32) &
xenos::kVertexIndexMask;
}
};
struct To24Swapping16In32IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k16in32) &
xenos::kVertexIndexMask;
}
};
// Triangle fans as triangle lists.
// Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D.
// https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
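// For example, a 5-vertex fan (v0, v1, v2, v3, v4) expands to the 9-index
// list (v1, v2, v0), (v2, v3, v0), (v3, v4, v0) -
// GetTriangleFanListIndexCount(5) == 9.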
static constexpr uint32_t GetTriangleFanListIndexCount(
uint32_t fan_index_count) {
return fan_index_count > 2 ? (fan_index_count - 2) * 3 : 0;
}
template <typename Index, typename IndexTransform>
static void TriangleFanToList(Index* dest, const Index* source,
uint32_t source_index_count,
const IndexTransform& index_transform) {
if (source_index_count <= 2) {
// To match GetTriangleFanListIndexCount.
return;
}
Index index_first = index_transform(source[0]);
Index index_previous = index_transform(source[1]);
for (uint32_t i = 2; i < source_index_count; ++i) {
Index index_current = index_transform(source[i]);
*(dest++) = index_previous;
*(dest++) = index_current;
*(dest++) = index_first;
index_previous = index_current;
}
}
static constexpr uint32_t GetLineLoopStripIndexCount(
uint32_t loop_index_count) {
// Even if 2 vertices are supplied, two lines are still drawn between them.
// https://www.khronos.org/opengl/wiki/Primitive
// "You get n lines for n input vertices"
// "If the user only specifies 1 vertex, the drawing command is ignored"
return loop_index_count > 1 ? loop_index_count + 1 : 0;
}
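// For example, a 4-vertex loop (v0, v1, v2, v3) becomes the 5-index strip
// (v0, v1, v2, v3, v0), with the closing v3-v0 segment drawn explicitly -
// GetLineLoopStripIndexCount(4) == 5.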
template <typename Index, typename IndexTransform>
static void LineLoopToStrip(Index* dest, const Index* source,
uint32_t source_index_count,
const IndexTransform& index_transform) {
if (source_index_count <= 1) {
// To match GetLineLoopStripIndexCount.
return;
}
Index index_first = index_transform(source[0]);
dest[0] = index_first;
for (uint32_t i = 1; i < source_index_count; ++i) {
dest[i] = index_transform(source[i]);
}
dest[source_index_count] = index_first;
}
static void LineLoopToStrip(uint16_t* dest, const uint16_t* source,
uint32_t source_index_count,
const PassthroughIndexTransform& index_transform);
static void LineLoopToStrip(uint32_t* dest, const uint32_t* source,
uint32_t source_index_count,
const PassthroughIndexTransform& index_transform);
static constexpr uint32_t GetQuadListTriangleListIndexCount(
uint32_t quad_list_index_count) {
return (quad_list_index_count / 4) * 6;
}
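// For example, the quad (v0, v1, v2, v3) becomes the two triangles
// (v0, v1, v2) and (v0, v2, v3) - 6 host indices per 4 guest indices.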
template <typename Index, typename IndexTransform>
static void QuadListToTriangleList(Index* dest, const Index* source,
uint32_t source_index_count,
const IndexTransform& index_transform) {
uint32_t quad_count = source_index_count / 4;
for (uint32_t i = 0; i < quad_count; ++i) {
// TODO(Triang3l): Find the correct order.
// v0, v1, v2.
Index common_index_0 = index_transform(*(source++));
*(dest++) = common_index_0;
*(dest++) = index_transform(*(source++));
Index common_index_2 = index_transform(*(source++));
*(dest++) = common_index_2;
// v0, v2, v3.
*(dest++) = common_index_0;
*(dest++) = common_index_2;
*(dest++) = index_transform(*(source++));
}
}
// Pre-gathering the ranges allows for usage of the same functions for
// conversion with and without reset. In addition, this increases safety in
// weird cases - there won't be a mismatch between the pre-calculation of the
// post-conversion index count and the actual conversion if the game for some
// reason modifies the index buffer between the two and adds or removes reset
// indices in it.
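// For example (a sketch), for a triangle fan with the 16-bit source
// 0, 1, 2, 3, R, 4, 5, 6 (R being the guest reset index), two ranges would be
// appended: {0, 4, 6} and {5, 3, 3} (guest offset, guest index count, host
// index count, the host counts coming from GetTriangleFanListIndexCount), and
// the total host index count returned would be 9.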
struct SinglePrimitiveRange {
SinglePrimitiveRange(uint32_t guest_offset, uint32_t guest_index_count,
uint32_t host_index_count)
: guest_offset(guest_offset),
guest_index_count(guest_index_count),
host_index_count(host_index_count) {}
uint32_t guest_offset;
uint32_t guest_index_count;
uint32_t host_index_count;
};
static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
const uint16_t* source, uint32_t source_index_count,
uint16_t reset_index_guest_endian,
std::deque<SinglePrimitiveRange>& ranges_append_out);
static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
const uint32_t* source, uint32_t source_index_count,
uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian,
std::deque<SinglePrimitiveRange>& ranges_append_out);
template <typename Index, typename IndexTransform,
typename PrimitiveRangeIterator>
static void ConvertSinglePrimitiveRanges(
Index* dest, const Index* source,
xenos::PrimitiveType source_primitive_type,
const IndexTransform& index_transform,
PrimitiveRangeIterator ranges_beginning,
PrimitiveRangeIterator ranges_end) {
Index* dest_write_ptr = dest;
switch (source_primitive_type) {
case xenos::PrimitiveType::kTriangleFan:
for (PrimitiveRangeIterator range_it = ranges_beginning;
range_it != ranges_end; ++range_it) {
TriangleFanToList(dest_write_ptr, source + range_it->guest_offset,
range_it->guest_index_count, index_transform);
dest_write_ptr += range_it->host_index_count;
}
break;
case xenos::PrimitiveType::kLineLoop:
for (PrimitiveRangeIterator range_it = ranges_beginning;
range_it != ranges_end; ++range_it) {
LineLoopToStrip(dest_write_ptr, source + range_it->guest_offset,
range_it->guest_index_count, index_transform);
dest_write_ptr += range_it->host_index_count;
}
break;
case xenos::PrimitiveType::kQuadList:
for (PrimitiveRangeIterator range_it = ranges_beginning;
range_it != ranges_end; ++range_it) {
QuadListToTriangleList(dest_write_ptr,
source + range_it->guest_offset,
range_it->guest_index_count, index_transform);
dest_write_ptr += range_it->host_index_count;
}
break;
default:
assert_unhandled_case(source_primitive_type);
}
}
const RegisterFile& register_file_;
Memory& memory_;
TraceWriter& trace_writer_;
SharedMemory& shared_memory_;
bool full_32bit_vertex_indices_used_ = false;
bool convert_triangle_fans_to_lists_ = false;
bool convert_line_loops_to_strips_ = false;
bool convert_quad_lists_to_triangle_lists_ = false;
// Byte offsets used, for simplicity, directly as handles.
size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
std::deque<SinglePrimitiveRange> single_primitive_ranges_;
// Caching for reuse of converted indices within a frame.
// The bucket size is 256 KB since the largest possible guest index buffer -
// 0xFFFF 32-bit indices - is slightly smaller than 256 KB, thus cache entries
// need to store links within at most 2 buckets.
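// (Assuming the 512 MiB shared memory buffer, this results in 2048 buckets.)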
static constexpr uint32_t kCacheBucketSizeBytesLog2 = 18;
static constexpr uint32_t kCacheBucketSizeBytes =
uint32_t(1) << kCacheBucketSizeBytesLog2;
static constexpr uint32_t kCacheBucketCount =
xe::align(SharedMemory::kBufferSize, kCacheBucketSizeBytes) /
kCacheBucketSizeBytes;
union CacheKey {
struct {
uint32_t base; // 32 total
uint32_t count : 16; // 48
xenos::IndexFormat format : 1; // 49
xenos::Endian endian : 2; // 51
uint32_t is_reset_enabled : 1; // 52
// kNone if not changing the type (like only processing the reset index).
xenos::PrimitiveType conversion_guest_primitive_type : 6; // 58
};
uint64_t key = 0;
CacheKey() = default;
CacheKey(uint32_t base, uint32_t count, xenos::IndexFormat format,
xenos::Endian endian, bool is_reset_enabled,
xenos::PrimitiveType conversion_guest_primitive_type =
xenos::PrimitiveType::kNone)
: base(base),
count(count),
format(format),
endian(endian),
is_reset_enabled(is_reset_enabled),
conversion_guest_primitive_type(conversion_guest_primitive_type) {}
struct Hasher {
size_t operator()(const CacheKey& key) const {
return std::hash<uint64_t>{}(key.key);
}
};
bool operator==(const CacheKey& other_key) const {
return key == other_key.key;
}
uint32_t GetSizeBytes() const {
return count * (format == xenos::IndexFormat::kInt16 ? sizeof(uint16_t)
: sizeof(uint32_t));
}
};
// Subset of ProcessingResult that can be reused for different primitive types
// if the same result applies irrespective of the type (like when only
// processing the reset index).
struct CachedResult {
uint32_t host_draw_vertex_count;
ProcessedIndexBufferType index_buffer_type;
xenos::IndexFormat host_index_format;
xenos::Endian host_index_endian;
bool host_primitive_reset_enabled;
size_t host_index_buffer_handle;
};
struct CacheEntry {
static_assert(
UINT16_MAX * sizeof(uint32_t) <=
(size_t(1) << kCacheBucketSizeBytesLog2),
"Assuming that primitive processor cache entries need to store to the "
"previous and to the next entries only within up to 2 buckets, so the "
"size of the cache buckets must be not smaller than the maximum guest "
"index buffer size");
union {
size_t free_next;
size_t buckets_prev[2];
};
size_t buckets_next[2];
CacheKey key;
CachedResult result;
static uint32_t GetBucketCount(CacheKey key) {
uint32_t count =
((key.base + (key.GetSizeBytes() - 1)) >> kCacheBucketSizeBytesLog2) -
(key.base >> kCacheBucketSizeBytesLog2) + 1;
assert_true(count <= 2,
"Cache entries only store list links within two buckets");
return count;
}
uint32_t GetBucketCount() const { return GetBucketCount(key); }
};
// A cache transaction performs a few operations in a RAII-like way (so
// processing may return an error for any reason, and won't have to clean up
// cache_currently_processing_base_ / size_bytes_ explicitly):
// - Transaction initialization:
// - Lookup of previously processed indices in the cache.
// - If not found, beginning to add a new entry that is going to be
// processed:
// - Marking the range as currently being processed, for slightly safer
// race condition handling if one happens - if invalidation happens
// during the transaction (but outside a global critical region lock,
// since processing may take a long time), the new cache entry won't be
// stored as it will already be invalid at the time of the completion of
// the transaction.
// - Enabling an access callback for the range.
// - Setting the new result after processing (if not found in the cache
// previously).
// - Transaction completion:
// - If the range wasn't invalidated during the transaction, storing the new
// entry in the cache.
// If an entry was found in the cache (GetFoundResult returns non-null), it
// MUST be used instead of processing - this class doesn't provide the
// possibility of replacing existing entries.
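// A usage sketch (simplified; guest_base, index_count and the other inputs
// are placeholders for values taken from the actual draw):
// CacheTransaction transaction(
// *this, CacheKey(guest_base, index_count, format, endian,
// is_reset_enabled, guest_primitive_type));
// if (const CachedResult* found = transaction.GetFoundResult()) {
// // Reuse found->host_index_buffer_handle and the other cached fields.
// } else {
// // ... convert the indices, then:
// transaction.SetNewResult(new_result);
// }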
class CacheTransaction final {
public:
CacheTransaction(PrimitiveProcessor& processor, CacheKey key);
const CachedResult* GetFoundResult() const {
return result_type_ == ResultType::kExisting ? &result_ : nullptr;
}
void SetNewResult(const CachedResult& new_result) {
// Replacement of an existing entry is not allowed.
assert_true(result_type_ != ResultType::kExisting);
result_ = new_result;
result_type_ = ResultType::kNewSet;
}
~CacheTransaction();
private:
PrimitiveProcessor& processor_;
// If key_.count == 0, this transaction shouldn't do anything - for empty
// ranges it's pointless, and it's unsafe to get the end pointer without
// special logic, and count == 0 is also used as a special indicator for
// vertex count below the cache usage threshold.
CacheKey key_;
CachedResult result_;
enum class ResultType {
kNewUnset,
kNewSet,
kExisting,
};
ResultType result_type_ = ResultType::kNewUnset;
};
std::deque<CacheEntry> cache_entry_pool_;
void* memory_invalidation_callback_handle_ = nullptr;
xe::global_critical_region global_critical_region_;
// Modified by both the processor and the invalidation callback.
std::unordered_map<CacheKey, size_t, CacheKey::Hasher> cache_map_;
// The conversion is performed while the lock is released since it may take a
// long time.
// If during the conversion the region currently being converted is
// invalidated, the current entry will not be added to the cache.
// Modified by the processor, read by the invalidation callback.
uint32_t cache_currently_processing_base_ = 0;
// Nonzero only while inside a cache transaction that hasn't found an existing
// entry (that is, while the range is being processed for a new cache entry).
uint32_t cache_currently_processing_size_bytes_ = 0;
// Modified by both the processor and the invalidation callback.
size_t cache_bucket_free_first_entry_ = SIZE_MAX;
// Modified by both the processor and the invalidation callback.
uint64_t cache_buckets_non_empty_l1_[(kCacheBucketCount + 63) / 64] = {};
// For even faster handling of memory invalidation - whether any bit is set in
// each cache_buckets_non_empty_l1_.
// Modified by both the processor and the invalidation callback.
uint64_t cache_buckets_non_empty_l2_[(kCacheBucketCount + (64 * 64 - 1)) /
(64 * 64)] = {};
// Must be called in a global critical region.
void UpdateCacheBucketsNonEmptyL2(
uint32_t bucket_index_div_64,
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
global_lock) {
uint64_t& cache_buckets_non_empty_l2_ref =
cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
uint64_t cache_buckets_non_empty_l2_bit = uint64_t(1)
<< (bucket_index_div_64 & 63);
if (cache_buckets_non_empty_l1_[bucket_index_div_64]) {
cache_buckets_non_empty_l2_ref |= cache_buckets_non_empty_l2_bit;
} else {
cache_buckets_non_empty_l2_ref &= ~cache_buckets_non_empty_l2_bit;
}
}
// cache_buckets_non_empty_l1_ (along with cache_buckets_non_empty_l2_, which
// must be kept in sync) is used to indicate whether each entry here is
// non-empty, for faster clearing (there's no special index value here for an
// empty entry).
// Huge, so it's the last member in the class.
// Modified by both the processor and the invalidation callback.
size_t cache_bucket_first_entries_[kCacheBucketCount];
static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range);
};
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_PRIMITIVE_PROCESSOR_H_