/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#define XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#include <climits>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <functional>
#include <mutex>
#include <unordered_map>
#include <utility>

#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/shared_memory.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"

#if XE_ARCH_AMD64
// 128-bit SSSE3-level (SSE2+ for integer comparison, SSSE3 for pshufb) or AVX
// (256-bit AVX only got integer operations such as comparison in AVX2, which
// is above the minimum requirements of Xenia).
#include <tmmintrin.h>
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
#elif XE_ARCH_ARM64
#include <arm_neon.h>
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
#else
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 0
#endif  // XE_ARCH

// The idea behind this config variable is to force both indirection without
// primitive reset and pre-masking / pre-swapping with primitive reset,
// therefore this is supposed to be checked only by the host if it supports
// indirection. It's pretty pointless to do only half of this on backends that
// support full 32-bit indices unconditionally.
DECLARE_bool(ignore_32bit_vertex_index_support);

namespace xe {
namespace gpu {
// Normalizes primitive data in various ways for use with Direct3D 12 and
// Vulkan (down to its minimum requirements plus the portability subset).
//
// This solves various issues:
// - Triangle fans not supported on Direct3D 10+ and the Vulkan portability
//   subset.
//   - Converts to triangle lists, both with and without primitive reset.
// - Line loops not supported on Direct3D 12 or Vulkan.
//   - Converts to line strips.
// - Quads not reproducible with line lists with adjacency without geometry
//   shaders (some Vulkan implementations), as well as being hard to debug in
//   PIX due to "catastrophic failures".
//   - Converts to triangle lists.
// - Vulkan requiring the 0xFFFF primitive restart index for 16-bit indices
//   and 0xFFFFFFFF for 32-bit (Direct3D 12 slightly relaxes this, allowing
//   0xFFFF for 32-bit also, but that's of no use to Xenia since guest indices
//   are usually big-endian). Also, only the 24 lower bits of the vertex index
//   are used on the guest (tested on an Adreno 200 phone with drawing, though
//   not with primitive restart as OpenGL ES 2.0 doesn't expose it), so the
//   upper 8 bits likely shouldn't have an effect on primitive restart (the
//   guest reset index 0xFFFFFF likely working for 0xFFFFFF, 0xFFFFFFFF, and
//   254 more indices), while Vulkan and Direct3D 12 require exactly
//   0xFFFFFFFF.
//   - For 16-bit indices with a guest reset index other than 0xFFFF (passing
//     0xFFFF directly to the host is fine because it's the same irrespective
//     of endianness), there are two possible solutions:
//     - If the index buffer doesn't contain 0xFFFF otherwise (since it's a
//       valid vertex index in this case), replacing the primitive reset index
//       with 0xFFFF in the 16-bit buffer.
//     - If the index buffer contains any usage of 0xFFFF as a real vertex
//       index, converting the index buffer to 32-bit, and replacing the
//       primitive reset index with 0xFFFFFFFF.
//   - For 32-bit indices, there are two paths:
//     - If the guest reset index is 0xFFFFFF, and the index buffer actually
//       uses only 0xFFFFFFFF for reset, using it without changes.
//     - If the guest uses something other than 0xFFFFFFFF for primitive
//       reset, replacing elements with (index & 0xFFFFFF) == reset_index with
//       0xFFFFFFFF.
// - Some Vulkan implementations only support 24-bit indices. Guests usually
//   pass big-endian indices, so we need all 32 bits (as the least significant
//   bits will be in 24...31) to perform the byte swapping. For this reason,
//   we load 32-bit indices indirectly, doing non-indexed draws and fetching
//   the indices from the shared memory. This, however, is not compatible with
//   primitive restart.
//   - Pre-swapping, masking to 24 bits, and converting the reset index to
//     0xFFFFFFFF, resulting in an index buffer that can be used directly.
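// A worked example of the 32-bit reset index handling described above (an
// illustration, not part of the interface): with a little-endian guest reset
// index of 0xFFFFFF, the buffer {0x00000005, 0x00FFFFFF, 0x12FFFFFF} becomes
// {0x00000005, 0xFFFFFFFF, 0xFFFFFFFF} on the host - every element whose low
// 24 bits equal the reset index acts as a reset regardless of its upper
// 8 bits, and is replaced with the exact 0xFFFFFFFF that Vulkan and
// Direct3D 12 require.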
class PrimitiveProcessor {
 public:
  enum ProcessedIndexBufferType {
    // Auto-indexed on the host.
    kNone,
    // GPU DMA, from the shared memory.
    // For 32-bit, indirection is needed if the host only supports 24-bit
    // indices (even for non-endian-swapped, as the GPU should be ignoring the
    // upper 8 bits completely, rather than exhibiting undefined behavior).
    kGuest,
    // Converted and stored in the primitive converter for the current draw
    // command. For 32-bit indices, if the host doesn't support all 32 bits,
    // this kind of an index buffer will always be pre-masked and pre-swapped.
    kHostConverted,
    // Auto-indexed on the guest, but with an adapter index buffer on the host.
    kHostBuiltin,
  };

  struct ProcessingResult {
    xenos::PrimitiveType guest_primitive_type;
    xenos::PrimitiveType host_primitive_type;
    // Includes whether tessellation is enabled (not kVertex) and the type of
    // tessellation.
    Shader::HostVertexShaderType host_vertex_shader_type;
    // Only used for non-kVertex host_vertex_shader_type. For kAdaptive, the
    // index buffer is always from the guest and fully 32-bit, and contains the
    // floating-point tessellation factors.
    xenos::TessellationMode tessellation_mode;
    // TODO(Triang3l): If important, split into the index count and the actual
    // index buffer size, using zeros for out-of-bounds indices.
    uint32_t host_draw_vertex_count;
    uint32_t line_loop_closing_index;
    ProcessedIndexBufferType index_buffer_type;
    uint32_t guest_index_base;
    xenos::IndexFormat host_index_format;
    xenos::Endian host_index_endian;
    // The reset index, if enabled, is always 0xFFFF for host_index_format
    // kInt16 and 0xFFFFFFFF for kInt32.
    bool host_primitive_reset_enabled;
    // Backend-specific handle for the index buffer valid for the current draw,
    // only valid for index_buffer_type kHostConverted and kHostBuiltin.
    size_t host_index_buffer_handle;
    bool IsTessellated() const {
      return host_vertex_shader_type != Shader::HostVertexShaderType::kVertex;
    }
  };
  virtual ~PrimitiveProcessor();

  bool AreFull32BitVertexIndicesUsed() const {
    return full_32bit_vertex_indices_used_;
  }
  bool IsConvertingTriangleFansToLists() const {
    return convert_triangle_fans_to_lists_;
  }
  bool IsConvertingLineLoopsToStrips() const {
    return convert_line_loops_to_strips_;
  }
  // Quad lists may be emulated as line lists with adjacency and a geometry
  // shader, but geometry shaders must be supported for this.
  bool IsConvertingQuadListsToTriangleLists() const {
    return convert_quad_lists_to_triangle_lists_;
  }

  // Submission must be open to call (may request the index buffer in the
  // shared memory).
  bool Process(ProcessingResult& result_out);
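  // A minimal usage sketch (assuming a backend command processor holding a
  // `primitive_processor_` pointer, which is hypothetical here - the actual
  // binding logic is backend-specific):
  //   PrimitiveProcessor::ProcessingResult result;
  //   if (!primitive_processor_->Process(result)) {
  //     return false;  // Skip the draw on failure.
  //   }
  //   if (result.index_buffer_type == ProcessedIndexBufferType::kNone) {
  //     // Auto-indexed draw of result.host_draw_vertex_count vertices.
  //   } else {
  //     // Bind the guest / converted / built-in index buffer and draw.
  //   }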
  // Invalidates the cache within the range.
  std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
      uint32_t physical_address_start, uint32_t length, bool exact_range);

 protected:
  // For host-side index buffer creation, the biggest possibly needed
  // contiguous allocation, in indices.
  // - No conversion: up to 0xFFFF vertices (as the vertex count in
  //   VGT_DRAW_INITIATOR is 16-bit).
  // - Triangle fans to lists: starting from the 3rd vertex, every guest
  //   vertex creates a triangle, thus the maximum is 3 * (UINT16_MAX - 2), or
  //   0x2FFF7. Primitive reset can only slow down the amplification - the 3
  //   vertices after a reset add 1 host vertex each, not 3 each.
  // - Line loops to strips: adding 1 vertex if there are at least 2 vertices
  //   in the original primitive, either replacing the primitive reset index
  //   with this new closing vertex, or in case of the final primitive, just
  //   adding a vertex - thus the absolute limit is UINT16_MAX + 1, or 0x10000.
  // - Quad lists to triangle lists: vertices are processed in groups of 4,
  //   each group converted to 6 vertices, so the limit is 1.5 * 0xFFFC, or
  //   0x17FFA.
  // Thus, the maximum vertex count is defined by triangle fan to list
  // conversion.
  // Also include padding for co-alignment of the source and the destination
  // for SIMD.
  static constexpr uint32_t kMinRequiredConvertedIndexBufferSize =
      sizeof(uint32_t) * (UINT16_MAX - 2) * 3 +
      XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE;
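  // For reference: with the values above, this evaluates to
  // 4 * 65533 * 3 + 16 = 786412 bytes - just under 768 KB for the worst case
  // of converting a triangle fan with the maximum 16-bit vertex count.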
  PrimitiveProcessor(const RegisterFile& register_file, Memory& memory,
                     TraceWriter& trace_writer, SharedMemory& shared_memory)
      : register_file_(register_file),
        memory_(memory),
        trace_writer_(trace_writer),
        shared_memory_(shared_memory) {}
  // Call from the backend-specific initialization function.
  // - full_32bit_vertex_indices_supported:
  //   - If the backend supports 32-bit indices unconditionally, and doesn't
  //     generate indirection logic in vertex shaders, pass hard-coded `true`.
  //   - Otherwise:
  //     - If the host doesn't support full 32-bit indices (but supports at
  //       least 24-bit indices), pass `false`.
  //     - If the host supports 32-bit indices, but the backend can handle both
  //       cases, pass `cvars::ignore_32bit_vertex_index_support`, and
  //       afterwards, check `AreFull32BitVertexIndicesUsed()` externally to
  //       see if indirection may be needed.
  //     - When full 32-bit indices are not supported, the host must be using
  //       auto-indexed draws for 32-bit indices of ProcessedIndexBufferType
  //       kGuest, while fetching the index data manually from the shared
  //       memory buffer and endian-swapping it.
  //     - Indirection, however, precludes primitive reset usage - so if
  //       primitive reset is needed, the primitive processor will pre-swap
  //       and pre-mask the index buffer so there are only host-endian
  //       0x00###### or 0xFFFFFFFF values in it. In this case, a
  //       kHostConverted index buffer is returned from Process, and
  //       indirection is not needed (and impossible since the index buffer is
  //       not in the shared memory buffer anymore), though a byte swap is
  //       still needed, as 16-bit indices may also be kHostConverted while
  //       being completely unaffected by this. The same applies to primitive
  //       type conversion - if it happens for 32-bit guest indices, and
  //       kHostConverted is returned, they will be pre-swapped and pre-masked.
  // - triangle_fans_supported, line_loops_supported, quad_lists_supported:
  //   - Pass true or false depending on whether the host actually supports
  //     those guest primitive types directly or through geometry shader
  //     emulation. Debug overriding will be resolved in the common code if
  //     needed.
  bool InitializeCommon(bool full_32bit_vertex_indices_supported,
                        bool triangle_fans_supported,
                        bool line_loops_supported, bool quad_lists_supported);
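  // A hypothetical example: a backend where the host supports 32-bit indices
  // unconditionally, but none of the three problematic primitive types, would
  // call:
  //   if (!InitializeCommon(true, false, false, false)) {
  //     return false;
  //   }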
  // If any primitive type conversion is needed for auto-indexed draws, called
  // from InitializeCommon (thus only once in the primitive processor's
  // lifetime) to set up the backend's index buffer containing indices for
  // primitive type remapping. The backend must allocate a `sizeof(uint16_t) *
  // index_count` buffer and call fill_callback for its mapping if creation is
  // successful. 16-bit indices are enough even if the backend has primitive
  // reset enabled all the time (Metal), as auto-indexed draws are limited to
  // UINT16_MAX vertices, not UINT16_MAX + 1.
  virtual bool InitializeBuiltin16BitIndexBuffer(
      uint32_t index_count, std::function<void(uint16_t*)> fill_callback) = 0;
  // Call last in implementation-specific shutdown, also callable from the
  // destructor.
  void ShutdownCommon();

  // Call at boundaries of lifespans of converted data (between frames,
  // preferably in the end of a frame, so between the swap and the next draw,
  // access violation handlers need to do less work).
  void ClearPerFrameCache();
  static constexpr size_t GetBuiltinIndexBufferOffsetBytes(size_t handle) {
    // For simplicity, just using the handles as byte offsets.
    return handle;
  }
  // The destination allocation must have XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // excess bytes.
  static ptrdiff_t GetSimdCoalignmentOffset(const void* host_index_ptr,
                                            uint32_t guest_index_base) {
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
    // Always moving the host pointer only forward into the allocation padding
    // space of XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE bytes. Without relying on
    // two's complement wrapping overflow behavior, the logic would look like:
    //   uintptr_t host_subalignment =
    //       reinterpret_cast<uintptr_t>(host_index_ptr) &
    //       (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
    //   uint32_t guest_subalignment = guest_index_base &
    //       (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
    //   if (guest_subalignment >= host_subalignment) {
    //     return guest_subalignment - host_subalignment;
    //   }
    //   return XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE -
    //          (host_subalignment - guest_subalignment);
    return ptrdiff_t(
        (guest_index_base - reinterpret_cast<uintptr_t>(host_index_ptr)) &
        (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
#else
    return 0;
#endif
  }
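  // Worked example: if host_index_ptr has subalignment 4 within a 16-byte
  // vector and guest_index_base & 15 == 12, the returned offset is
  // (12 - 4) & 15 == 8 - after adding it, the destination has the same
  // subalignment as the guest data, so the guest source can be read with
  // aligned vector loads while the converted output is written with unaligned
  // stores.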
  // Requests a buffer to write the new transformed indices to. The lifetime
  // of the returned buffer must be that of the current frame. Returns the
  // mapping of the buffer to write to, or nullptr in case of failure, and, if
  // successful, a handle that can be used by the backend's command processor
  // to access the backend-specific data for binding the buffer.
  virtual void* RequestHostConvertedIndexBufferForCurrentFrame(
      xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
      uint32_t coalignment_original_address, size_t& backend_handle_out) = 0;
 private:
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
#if XE_ARCH_AMD64
  // SSSE3 or AVX.
  using SimdVectorU16 = __m128i;
  using SimdVectorU32 = __m128i;
  static SimdVectorU16 ReplicateU16(uint16_t value) {
    return _mm_set1_epi16(int16_t(value));
  }
  static SimdVectorU32 ReplicateU32(uint32_t value) {
    return _mm_set1_epi32(int32_t(value));
  }
  static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
    return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
  }
  static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
    return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
  }
  static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
  }
  static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
  }
#elif XE_ARCH_ARM64
  // NEON.
  using SimdVectorU16 = uint16x8_t;
  using SimdVectorU32 = uint32x4_t;
  static SimdVectorU16 ReplicateU16(uint16_t value) {
    return vdupq_n_u16(value);
  }
  static SimdVectorU32 ReplicateU32(uint32_t value) {
    return vdupq_n_u32(value);
  }
  static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
#if XE_COMPILER_MSVC
    return vld1q_u16_ex(source, sizeof(uint16x8_t) * CHAR_BIT);
#else
    return vld1q_u16(reinterpret_cast<const uint16_t*>(
        __builtin_assume_aligned(source, sizeof(uint16x8_t))));
#endif
  }
  static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
#if XE_COMPILER_MSVC
    return vld1q_u32_ex(source, sizeof(uint32x4_t) * CHAR_BIT);
#else
    return vld1q_u32(reinterpret_cast<const uint32_t*>(
        __builtin_assume_aligned(source, sizeof(uint32x4_t))));
#endif
  }
  static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
    vst1q_u16(dest, source);
  }
  static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
    vst1q_u32(dest, source);
  }
#else
#error SIMD vector types and constant loads not specified.
#endif  // XE_ARCH
  static_assert(
      sizeof(SimdVectorU16) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
      "XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
      "actually used");
  static_assert(
      sizeof(SimdVectorU32) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
      "XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
      "actually used");
  static constexpr uint32_t kSimdVectorU16Elements =
      sizeof(SimdVectorU16) / sizeof(uint16_t);
  static constexpr uint32_t kSimdVectorU32Elements =
      sizeof(SimdVectorU32) / sizeof(uint32_t);
#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  static bool IsResetUsed(const uint16_t* source, uint32_t count,
                          uint16_t reset_index_guest_endian);
  static void Get16BitResetIndexUsage(const uint16_t* source, uint32_t count,
                                      uint16_t reset_index_guest_endian,
                                      bool& is_reset_index_used_out,
                                      bool& is_ffff_used_as_vertex_index_out);
  static bool IsResetUsed(const uint32_t* source, uint32_t count,
                          uint32_t reset_index_guest_endian,
                          uint32_t low_bits_mask_guest_endian);
  static void ReplaceResetIndex16To16(uint16_t* dest, const uint16_t* source,
                                      uint32_t count,
                                      uint16_t reset_index_guest_endian);
  // For use when the reset index is not 0xFFFF, and 0xFFFF is also used as a
  // valid index - keeps 0xFFFF as a real index and replaces the reset index
  // with 0xFFFFFFFF instead.
  static void ReplaceResetIndex16To24(uint32_t* dest, const uint16_t* source,
                                      uint32_t count,
                                      uint16_t reset_index_guest_endian);
  // The reset index and the low 24 bits mask are taken explicitly because
  // this function may be used two ways:
  // - Passthrough - when the vertex shader swaps the indices (when 32-bit
  //   indices are supported on the host), in this case HostSwap is kNone, but
  //   the reset index and the guest low bits mask can be swapped according to
  //   the guest endian.
  // - Swapping for the host - when only 24 bits of an index are supported on
  //   the host. In this case, masking and comparison are done before applying
  //   HostSwap, but according to HostSwap, if needed, the data is swapped
  //   from the PowerPC's big endianness to the host GPU little endianness
  //   that we assume, which matches the Xenos's little endianness.
  template <xenos::Endian HostSwap>
  static void ReplaceResetIndex32To24(uint32_t* dest, const uint32_t* source,
                                      uint32_t count,
                                      uint32_t reset_index_guest_endian,
                                      uint32_t low_bits_mask_guest_endian) {
    // The Xbox 360's GPU only uses the low 24 bits of the index - masking.
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
    while (count && (reinterpret_cast<uintptr_t>(source) &
                     (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1))) {
      --count;
      uint32_t index = *(source++) & low_bits_mask_guest_endian;
      *(dest++) = index != reset_index_guest_endian
                      ? xenos::GpuSwap(index, HostSwap)
                      : UINT32_MAX;
    }
    if (count >= kSimdVectorU32Elements) {
      SimdVectorU32 reset_index_guest_endian_simd =
          ReplicateU32(reset_index_guest_endian);
      SimdVectorU32 low_bits_mask_guest_endian_simd =
          ReplicateU32(low_bits_mask_guest_endian);
#if XE_ARCH_AMD64
      __m128i host_swap_shuffle;
      if constexpr (HostSwap != xenos::Endian::kNone) {
        host_swap_shuffle = _mm_set_epi32(
            int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
            int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
            int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
            int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
      }
#endif  // XE_ARCH_AMD64
      while (count >= kSimdVectorU32Elements) {
        count -= kSimdVectorU32Elements;
        // Comparison produces 0 or 0xFFFFFFFF per element on SSE2+ and Neon -
        // 0xFFFFFFFF is exactly what's needed for the primitive reset
        // indices, so the result is `index | (index == reset_index)`.
        SimdVectorU32 source_simd = LoadAlignedVectorU32(source);
        source += kSimdVectorU32Elements;
        SimdVectorU32 result_simd;
#if XE_ARCH_AMD64
        source_simd =
            _mm_and_si128(source_simd, low_bits_mask_guest_endian_simd);
        result_simd = _mm_or_si128(
            source_simd,
            _mm_cmpeq_epi32(source_simd, reset_index_guest_endian_simd));
        if constexpr (HostSwap != xenos::Endian::kNone) {
          result_simd = _mm_shuffle_epi8(result_simd, host_swap_shuffle);
        }
#elif XE_ARCH_ARM64
        source_simd = vandq_u32(source_simd, low_bits_mask_guest_endian_simd);
        result_simd = vorrq_u32(
            source_simd,
            vceqq_u32(source_simd, reset_index_guest_endian_simd));
        if constexpr (HostSwap == xenos::Endian::k8in16) {
          result_simd = vreinterpretq_u32_u8(
              vrev16q_u8(vreinterpretq_u8_u32(result_simd)));
        } else if constexpr (HostSwap == xenos::Endian::k8in32) {
          result_simd = vreinterpretq_u32_u8(
              vrev32q_u8(vreinterpretq_u8_u32(result_simd)));
        } else if constexpr (HostSwap == xenos::Endian::k16in32) {
          result_simd = vreinterpretq_u32_u16(
              vrev32q_u16(vreinterpretq_u16_u32(result_simd)));
        }
#else
#error SIMD ReplaceResetIndex32To24 not implemented.
#endif  // XE_ARCH
        StoreUnalignedVectorU32(dest, result_simd);
        dest += kSimdVectorU32Elements;
      }
    }
#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
    while (count--) {
      uint32_t index = *(source++) & low_bits_mask_guest_endian;
      *(dest++) = index != reset_index_guest_endian
                      ? xenos::GpuSwap(index, HostSwap)
                      : UINT32_MAX;
    }
  }
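  // An illustration (not an actual call site): for k8in32-swapped guest
  // indices with a guest reset index of 0xFFFFFE, the caller would pass
  // xenos::GpuSwap(uint32_t(0xFFFFFE), xenos::Endian::k8in32) == 0xFEFFFF00
  // as reset_index_guest_endian and
  // xenos::GpuSwap(uint32_t(0x00FFFFFF), xenos::Endian::k8in32) == 0xFFFFFF00
  // as low_bits_mask_guest_endian, so the masking and the comparison are done
  // directly on guest-endian data before HostSwap is applied.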
  // TODO(Triang3l): 16-bit to 32-bit primitive type conversion for Metal,
  // where primitive reset is always enabled, if UINT16_MAX is used as a real
  // vertex index.
  struct PassthroughIndexTransform {
    uint16_t operator()(uint16_t index) const { return index; }
    uint32_t operator()(uint32_t index) const { return index; }
  };
  struct To24NonSwappingIndexTransform {
    uint32_t operator()(uint32_t index) const {
      return index & xenos::kVertexIndexMask;
    }
  };
  struct To24Swapping8In16IndexTransform {
    uint32_t operator()(uint32_t index) const {
      return xenos::GpuSwap(index, xenos::Endian::k8in16) &
             xenos::kVertexIndexMask;
    }
  };
  struct To24Swapping8In32IndexTransform {
    uint32_t operator()(uint32_t index) const {
      return xenos::GpuSwap(index, xenos::Endian::k8in32) &
             xenos::kVertexIndexMask;
    }
  };
  struct To24Swapping16In32IndexTransform {
    uint32_t operator()(uint32_t index) const {
      return xenos::GpuSwap(index, xenos::Endian::k16in32) &
             xenos::kVertexIndexMask;
    }
  };
  // Triangle fans as triangle lists.
  // Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D.
  // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
  static constexpr uint32_t GetTriangleFanListIndexCount(
      uint32_t fan_index_count) {
    return fan_index_count > 2 ? (fan_index_count - 2) * 3 : 0;
  }
  template <typename Index, typename IndexTransform>
  static void TriangleFanToList(Index* dest, const Index* source,
                                uint32_t source_index_count,
                                const IndexTransform& index_transform) {
    if (source_index_count <= 2) {
      // To match GetTriangleFanListIndexCount.
      return;
    }
    Index index_first = index_transform(source[0]);
    Index index_previous = index_transform(source[1]);
    for (uint32_t i = 2; i < source_index_count; ++i) {
      Index index_current = index_transform(source[i]);
      *(dest++) = index_previous;
      *(dest++) = index_current;
      *(dest++) = index_first;
      index_previous = index_current;
    }
  }
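  // Example: a guest fan (v0, v1, v2, v3, v4) becomes the list
  // (v1, v2, v0), (v2, v3, v0), (v3, v4, v0) - 3 * (5 - 2) = 9 host indices,
  // matching GetTriangleFanListIndexCount(5).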
  static constexpr uint32_t GetLineLoopStripIndexCount(
      uint32_t loop_index_count) {
    // Even if 2 vertices are supplied, two lines are still drawn between
    // them.
    // https://www.khronos.org/opengl/wiki/Primitive
    // "You get n lines for n input vertices"
    // "If the user only specifies 1 vertex, the drawing command is ignored"
    return loop_index_count > 1 ? loop_index_count + 1 : 0;
  }
  template <typename Index, typename IndexTransform>
  static void LineLoopToStrip(Index* dest, const Index* source,
                              uint32_t source_index_count,
                              const IndexTransform& index_transform) {
    if (source_index_count <= 1) {
      // To match GetLineLoopStripIndexCount.
      return;
    }
    Index index_first = index_transform(source[0]);
    dest[0] = index_first;
    for (uint32_t i = 1; i < source_index_count; ++i) {
      dest[i] = index_transform(source[i]);
    }
    dest[source_index_count] = index_first;
  }
  static void LineLoopToStrip(
      uint16_t* dest, const uint16_t* source, uint32_t source_index_count,
      const PassthroughIndexTransform& index_transform);
  static void LineLoopToStrip(
      uint32_t* dest, const uint32_t* source, uint32_t source_index_count,
      const PassthroughIndexTransform& index_transform);
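  // Example: a guest loop (v0, v1, v2) becomes the strip (v0, v1, v2, v0),
  // drawing the segments v0-v1, v1-v2 and the closing v2-v0 - 3 + 1 = 4 host
  // indices, matching GetLineLoopStripIndexCount(3).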
  static constexpr uint32_t GetQuadListTriangleListIndexCount(
      uint32_t quad_list_index_count) {
    return (quad_list_index_count / 4) * 6;
  }
  template <typename Index, typename IndexTransform>
  static void QuadListToTriangleList(Index* dest, const Index* source,
                                     uint32_t source_index_count,
                                     const IndexTransform& index_transform) {
    uint32_t quad_count = source_index_count / 4;
    for (uint32_t i = 0; i < quad_count; ++i) {
      // TODO(Triang3l): Find the correct order.
      // v0, v1, v2.
      Index common_index_0 = index_transform(*(source++));
      *(dest++) = common_index_0;
      *(dest++) = index_transform(*(source++));
      Index common_index_2 = index_transform(*(source++));
      *(dest++) = common_index_2;
      // v0, v2, v3.
      *(dest++) = common_index_0;
      *(dest++) = common_index_2;
      *(dest++) = index_transform(*(source++));
    }
  }
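  // Example: one guest quad (v0, v1, v2, v3) becomes the two triangles
  // (v0, v1, v2) and (v0, v2, v3) - 6 host indices for every 4 guest ones, as
  // in GetQuadListTriangleListIndexCount.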
  // Pre-gathering the ranges allows the same functions to be used for
  // conversion both with and without reset. In addition, this increases
  // safety in weird cases - there won't be a mismatch between the
  // pre-calculation of the post-conversion index count and the actual
  // conversion if the game for some reason modifies the index buffer between
  // the two and adds or removes reset indices in it.
  struct SinglePrimitiveRange {
    SinglePrimitiveRange(uint32_t guest_offset, uint32_t guest_index_count,
                         uint32_t host_index_count)
        : guest_offset(guest_offset),
          guest_index_count(guest_index_count),
          host_index_count(host_index_count) {}
    uint32_t guest_offset;
    uint32_t guest_index_count;
    uint32_t host_index_count;
  };
  static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
      std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
      const uint16_t* source, uint32_t source_index_count,
      uint16_t reset_index_guest_endian,
      std::deque<SinglePrimitiveRange>& ranges_append_out);
  static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
      std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
      const uint32_t* source, uint32_t source_index_count,
      uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian,
      std::deque<SinglePrimitiveRange>& ranges_append_out);
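  // Worked example for triangle fans (assuming splitting at each reset index,
  // which is what the ranges represent): a 16-bit source
  // {0, 1, 2, 3, 0xFFFF, 4, 5, 6} with the reset index 0xFFFF is split into
  // two ranges - {guest_offset 0, guest_index_count 4, host_index_count 6}
  // and {guest_offset 5, guest_index_count 3, host_index_count 3} - and the
  // returned total host index count is 9, with
  // single_primitive_guest_to_host_count being GetTriangleFanListIndexCount.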
  template <typename Index, typename IndexTransform,
            typename PrimitiveRangeIterator>
  static void ConvertSinglePrimitiveRanges(
      Index* dest, const Index* source,
      xenos::PrimitiveType source_primitive_type,
      const IndexTransform& index_transform,
      PrimitiveRangeIterator ranges_beginning,
      PrimitiveRangeIterator ranges_end) {
    Index* dest_write_ptr = dest;
    switch (source_primitive_type) {
      case xenos::PrimitiveType::kTriangleFan:
        for (PrimitiveRangeIterator range_it = ranges_beginning;
             range_it != ranges_end; ++range_it) {
          TriangleFanToList(dest_write_ptr, source + range_it->guest_offset,
                            range_it->guest_index_count, index_transform);
          dest_write_ptr += range_it->host_index_count;
        }
        break;
      case xenos::PrimitiveType::kLineLoop:
        for (PrimitiveRangeIterator range_it = ranges_beginning;
             range_it != ranges_end; ++range_it) {
          LineLoopToStrip(dest_write_ptr, source + range_it->guest_offset,
                          range_it->guest_index_count, index_transform);
          dest_write_ptr += range_it->host_index_count;
        }
        break;
      case xenos::PrimitiveType::kQuadList:
        for (PrimitiveRangeIterator range_it = ranges_beginning;
             range_it != ranges_end; ++range_it) {
          QuadListToTriangleList(dest_write_ptr,
                                 source + range_it->guest_offset,
                                 range_it->guest_index_count, index_transform);
          dest_write_ptr += range_it->host_index_count;
        }
        break;
      default:
        assert_unhandled_case(source_primitive_type);
    }
  }
  const RegisterFile& register_file_;
  Memory& memory_;
  TraceWriter& trace_writer_;
  SharedMemory& shared_memory_;

  bool full_32bit_vertex_indices_used_ = false;
  bool convert_triangle_fans_to_lists_ = false;
  bool convert_line_loops_to_strips_ = false;
  bool convert_quad_lists_to_triangle_lists_ = false;

  // Byte offsets used, for simplicity, directly as handles.
  size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
  size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;

  std::deque<SinglePrimitiveRange> single_primitive_ranges_;
  // Caching for reuse of converted indices within a frame.

  // The largest possible guest index buffer - 0xFFFF 32-bit indices - is
  // slightly smaller than 256 KB, thus cache entries need to store links
  // within at most 2 buckets.
  static constexpr uint32_t kCacheBucketSizeBytesLog2 = 18;
  static constexpr uint32_t kCacheBucketSizeBytes =
      uint32_t(1) << kCacheBucketSizeBytesLog2;
  static constexpr uint32_t kCacheBucketCount =
      xe::align(SharedMemory::kBufferSize, kCacheBucketSizeBytes) /
      kCacheBucketSizeBytes;
  union CacheKey {
    struct {
      uint32_t base;                  // 32 total
      uint32_t count : 16;            // 48
      xenos::IndexFormat format : 1;  // 49
      xenos::Endian endian : 2;       // 51
      uint32_t is_reset_enabled : 1;  // 52
      // kNone if not changing the type (like only processing the reset
      // index).
      xenos::PrimitiveType conversion_guest_primitive_type : 6;  // 58
    };
    uint64_t key = 0;

    CacheKey() = default;
    CacheKey(uint32_t base, uint32_t count, xenos::IndexFormat format,
             xenos::Endian endian, bool is_reset_enabled,
             xenos::PrimitiveType conversion_guest_primitive_type =
                 xenos::PrimitiveType::kNone)
        : base(base),
          count(count),
          format(format),
          endian(endian),
          is_reset_enabled(is_reset_enabled),
          conversion_guest_primitive_type(conversion_guest_primitive_type) {}

    struct Hasher {
      size_t operator()(const CacheKey& key) const {
        return std::hash<uint64_t>{}(key.key);
      }
    };
    bool operator==(const CacheKey& other_key) const {
      return key == other_key.key;
    }

    uint32_t GetSizeBytes() const {
      return count * (format == xenos::IndexFormat::kInt16 ? sizeof(uint16_t)
                                                           : sizeof(uint32_t));
    }
  };
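  // Example: CacheKey(0x2000, 6, xenos::IndexFormat::kInt16,
  // xenos::Endian::k8in16, false) describes 6 16-bit indices at guest address
  // 0x2000 - GetSizeBytes() is 12 - and two such keys compare equal through
  // the single packed 64-bit `key` field, which Hasher hashes as well.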
  // Subset of ProcessingResult that can be reused for different primitive
  // types if the same result is used irrespective of one (like when only
  // processing the reset index).
  struct CachedResult {
    uint32_t host_draw_vertex_count;
    ProcessedIndexBufferType index_buffer_type;
    xenos::IndexFormat host_index_format;
    xenos::Endian host_index_endian;
    bool host_primitive_reset_enabled;
    size_t host_index_buffer_handle;
  };
  struct CacheEntry {
    static_assert(
        UINT16_MAX * sizeof(uint32_t) <=
            (size_t(1) << kCacheBucketSizeBytesLog2),
        "Assuming that primitive processor cache entries need to store links "
        "to the previous and the next entries only within up to 2 buckets, "
        "so the size of the cache buckets must not be smaller than the "
        "maximum guest index buffer size");
    union {
      size_t free_next;
      size_t buckets_prev[2];
    };
    size_t buckets_next[2];
    CacheKey key;
    CachedResult result;
    static uint32_t GetBucketCount(CacheKey key) {
      uint32_t count =
          ((key.base + (key.GetSizeBytes() - 1)) >>
           kCacheBucketSizeBytesLog2) -
          (key.base >> kCacheBucketSizeBytesLog2) + 1;
      assert_true(count <= 2,
                  "Cache entries only store list links within two buckets");
      return count;
    }
    uint32_t GetBucketCount() const { return GetBucketCount(key); }
  };
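  // GetBucketCount example with the 256 KB (1 << 18 bytes) buckets: a key
  // with base == 0x3FFF0 and 16 32-bit indices (64 bytes) covers bytes
  // 0x3FFF0...0x4002F, crossing from bucket 0 into bucket 1, so the count is
  // 2; the same data fully within one bucket would give 1.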
  // A cache transaction performs a few operations in a RAII-like way (so
  // processing may return an error for any reason, and won't have to clean up
  // cache_currently_processing_base_ / size_bytes_ explicitly):
  // - Transaction initialization:
  //   - Lookup of previously processed indices in the cache.
  //   - If not found, beginning to add a new entry that is going to be
  //     processed:
  //     - Marking the range as currently being processed, for slightly safer
  //       race condition handling if one happens - if invalidation happens
  //       during the transaction (but outside a global critical region lock,
  //       since processing may take a long time), the new cache entry won't
  //       be stored, as it will already be invalid at the time of the
  //       completion of the transaction.
  //     - Enabling an access callback for the range.
  // - Setting the new result after processing (if not found in the cache
  //   previously).
  // - Transaction completion:
  //   - If the range wasn't invalidated during the transaction, storing the
  //     new entry in the cache.
  // If an entry was found in the cache (GetFoundResult returns non-null), it
  // MUST be used instead of processing - this class doesn't provide the
  // possibility to replace existing entries.
  class CacheTransaction final {
   public:
    CacheTransaction(PrimitiveProcessor& processor, CacheKey key);
    const CachedResult* GetFoundResult() const {
      return result_type_ == ResultType::kExisting ? &result_ : nullptr;
    }
    void SetNewResult(const CachedResult& new_result) {
      // Replacement of an existing entry is not allowed.
      assert_true(result_type_ != ResultType::kExisting);
      result_ = new_result;
      result_type_ = ResultType::kNewSet;
    }
    ~CacheTransaction();

   private:
    PrimitiveProcessor& processor_;
    // If key_.count == 0, this transaction shouldn't do anything - for empty
    // ranges it's pointless, and it's unsafe to get the end pointer without
    // special logic, and count == 0 is also used as a special indicator for
    // vertex count below the cache usage threshold.
    CacheKey key_;
    CachedResult result_;
    enum class ResultType {
      kNewUnset,
      kNewSet,
      kExisting,
    };
    ResultType result_type_ = ResultType::kNewUnset;
  };
  std::deque<CacheEntry> cache_entry_pool_;

  void* memory_invalidation_callback_handle_ = nullptr;
  xe::global_critical_region global_critical_region_;
  // Modified by both the processor and the invalidation callback.
  std::unordered_map<CacheKey, size_t, CacheKey::Hasher> cache_map_;
  // The conversion is performed while the lock is released since it may take
  // a long time.
  // If during the conversion the region currently being converted is
  // invalidated, the current entry will not be added to the cache.
  // Modified by the processor, read by the invalidation callback.
  uint32_t cache_currently_processing_base_ = 0;
  // 0 if currently not in a cache transaction that hasn't found an existing
  // entry.
  uint32_t cache_currently_processing_size_bytes_ = 0;
  // Modified by both the processor and the invalidation callback.
  size_t cache_bucket_free_first_entry_ = SIZE_MAX;
  // Modified by both the processor and the invalidation callback.
  uint64_t cache_buckets_non_empty_l1_[(kCacheBucketCount + 63) / 64] = {};
  // For even faster handling of memory invalidation - whether any bit is set
  // in each 64-bit word of cache_buckets_non_empty_l1_.
  // Modified by both the processor and the invalidation callback.
  uint64_t cache_buckets_non_empty_l2_[(kCacheBucketCount + (64 * 64 - 1)) /
                                       (64 * 64)] = {};
  // Must be called in a global critical region.
  void UpdateCacheBucketsNonEmptyL2(
      uint32_t bucket_index_div_64,
      [[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
          global_lock) {
    uint64_t& cache_buckets_non_empty_l2_ref =
        cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
    uint64_t cache_buckets_non_empty_l2_bit = uint64_t(1)
                                              << (bucket_index_div_64 & 63);
    if (cache_buckets_non_empty_l1_[bucket_index_div_64]) {
      cache_buckets_non_empty_l2_ref |= cache_buckets_non_empty_l2_bit;
    } else {
      cache_buckets_non_empty_l2_ref &= ~cache_buckets_non_empty_l2_bit;
    }
  }
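  // Illustration: each bit of cache_buckets_non_empty_l1_ corresponds to one
  // bucket, and each bit of cache_buckets_non_empty_l2_ to one whole 64-bit
  // L1 word. If, say, only bucket 70 is non-empty, bit 6 of
  // cache_buckets_non_empty_l1_[1] is set, so bit 1 of
  // cache_buckets_non_empty_l2_[0] must be set as well - which is what
  // UpdateCacheBucketsNonEmptyL2(1, ...) maintains after L1 changes.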
  // cache_buckets_non_empty_l1_ (along with cache_buckets_non_empty_l2_,
  // which must be kept in sync) is used to indicate whether each bucket is
  // non-empty, for faster clearing (there's no special index here for an
  // empty bucket).
  // Huge, so it's the last in the class.
  // Modified by both the processor and the invalidation callback.
  size_t cache_bucket_first_entries_[kCacheBucketCount];

  static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
      void* context_ptr, uint32_t physical_address_start, uint32_t length,
      bool exact_range);
};

}  // namespace gpu
}  // namespace xe

#endif  // XENIA_GPU_PRIMITIVE_PROCESSOR_H_