xenia/src/xenia/gpu/dxbc_shader_translator.h

/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2018 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_GPU_DXBC_SHADER_TRANSLATOR_H_
#define XENIA_GPU_DXBC_SHADER_TRANSLATOR_H_

#include <cstring>
#include <string>
#include <vector>

#include "xenia/base/math.h"
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/shader_translator.h"

namespace xe {
namespace gpu {

// Generates shader model 5_1 byte code (for Direct3D 12).
class DxbcShaderTranslator : public ShaderTranslator {
 public:
  DxbcShaderTranslator(bool edram_rov_used);
  ~DxbcShaderTranslator() override;

  // Constant buffer bindings in space 0. The enumerator value is the binding
  // index, spelled out explicitly so the numbering is visible at a glance.
  enum class CbufferRegister {
    kSystemConstants = 0,
    kFloatConstants = 1,
    kBoolLoopConstants = 2,
    kFetchConstants = 3,
  };

  // System flag bits (kSysFlag_*). The first group lists bit positions,
  // implicitly numbered from 0 in declaration order; the second group defines
  // the single-bit mask for each of those positions.
  // NOTE(review): presumably these are the bits of SystemConstants::flags -
  // confirm against the code that fills that field.
  enum : uint32_t {
    kSysFlag_XYDividedByW_Shift,
    kSysFlag_ZDividedByW_Shift,
    kSysFlag_WNotReciprocal_Shift,
    kSysFlag_ReverseZ_Shift,
    kSysFlag_DepthStencil_Shift,
    kSysFlag_DepthFloat24_Shift,
    // Depth/stencil testing not done if DepthStencilRead is disabled, but
    // writing may still be done.
    kSysFlag_DepthPassIfLess_Shift,
    kSysFlag_DepthPassIfEqual_Shift,
    kSysFlag_DepthPassIfGreater_Shift,
    // 1 to write new depth to the depth buffer, 0 to keep the old one if the
    // depth test passes.
    kSysFlag_DepthWriteMask_Shift,
    kSysFlag_StencilTest_Shift,
    // This doesn't include depth/stencil masks - only reflects the fact that
    // the new value must be written.
    kSysFlag_DepthStencilWrite_Shift,
    kSysFlag_Color0Gamma_Shift,
    kSysFlag_Color1Gamma_Shift,
    kSysFlag_Color2Gamma_Shift,
    kSysFlag_Color3Gamma_Shift,

    // Masks: one bit per flag, at the position given by the matching _Shift.
    kSysFlag_XYDividedByW = 1u << kSysFlag_XYDividedByW_Shift,
    kSysFlag_ZDividedByW = 1u << kSysFlag_ZDividedByW_Shift,
    kSysFlag_WNotReciprocal = 1u << kSysFlag_WNotReciprocal_Shift,
    kSysFlag_ReverseZ = 1u << kSysFlag_ReverseZ_Shift,
    kSysFlag_DepthStencil = 1u << kSysFlag_DepthStencil_Shift,
    kSysFlag_DepthFloat24 = 1u << kSysFlag_DepthFloat24_Shift,
    kSysFlag_DepthPassIfLess = 1u << kSysFlag_DepthPassIfLess_Shift,
    kSysFlag_DepthPassIfEqual = 1u << kSysFlag_DepthPassIfEqual_Shift,
    kSysFlag_DepthPassIfGreater = 1u << kSysFlag_DepthPassIfGreater_Shift,
    kSysFlag_DepthWriteMask = 1u << kSysFlag_DepthWriteMask_Shift,
    kSysFlag_StencilTest = 1u << kSysFlag_StencilTest_Shift,
    kSysFlag_DepthStencilWrite = 1u << kSysFlag_DepthStencilWrite_Shift,
    kSysFlag_Color0Gamma = 1u << kSysFlag_Color0Gamma_Shift,
    kSysFlag_Color1Gamma = 1u << kSysFlag_Color1Gamma_Shift,
    kSysFlag_Color2Gamma = 1u << kSysFlag_Color2Gamma_Shift,
    kSysFlag_Color3Gamma = 1u << kSysFlag_Color3Gamma_Shift,
  };

  // Stencil operations expressed as combinations of sub-operation flags
  // (kStencilOp_Flag_*) rather than as a plain operation index. The first
  // group lists bit positions, the second the flag masks, and the third the
  // complete operations built by OR-ing the flags together.
  enum : uint32_t {
    kStencilOp_Flag_CurrentMask_Shift,
    // 0, 1 or 3 expanded to 0 or 1 or 0xFF - the value to add.
    kStencilOp_Flag_Add_Shift,
    // The add field is two bits wide (it can hold 3), so the next flag starts
    // two bits after it.
    kStencilOp_Flag_Saturate_Shift = kStencilOp_Flag_Add_Shift + 2,
    kStencilOp_Flag_Invert_Shift,
    kStencilOp_Flag_NewMask_Shift,

    kStencilOp_Flag_CurrentMask = 1u << kStencilOp_Flag_CurrentMask_Shift,
    // Increment and Decrement are the two non-zero values of the add field.
    kStencilOp_Flag_Increment = 1u << kStencilOp_Flag_Add_Shift,
    kStencilOp_Flag_Decrement = 3u << kStencilOp_Flag_Add_Shift,
    kStencilOp_Flag_Saturate = 1u << kStencilOp_Flag_Saturate_Shift,
    kStencilOp_Flag_Invert = 1u << kStencilOp_Flag_Invert_Shift,
    kStencilOp_Flag_NewMask = 1u << kStencilOp_Flag_NewMask_Shift,

    // Complete operations. Zero sets no flags at all; Replace sets only
    // NewMask; every other operation includes CurrentMask.
    kStencilOp_Keep = kStencilOp_Flag_CurrentMask,
    kStencilOp_Zero = 0,
    kStencilOp_Replace = kStencilOp_Flag_NewMask,
    kStencilOp_IncrementSaturate = kStencilOp_Flag_CurrentMask |
                                   kStencilOp_Flag_Increment |
                                   kStencilOp_Flag_Saturate,
    kStencilOp_DecrementSaturate = kStencilOp_Flag_CurrentMask |
                                   kStencilOp_Flag_Decrement |
                                   kStencilOp_Flag_Saturate,
    kStencilOp_Invert = kStencilOp_Flag_CurrentMask | kStencilOp_Flag_Invert,
    kStencilOp_Increment =
        kStencilOp_Flag_CurrentMask | kStencilOp_Flag_Increment,
    kStencilOp_Decrement =
        kStencilOp_Flag_CurrentMask | kStencilOp_Flag_Decrement,
  };

  // Per-render-target flag bits (kRTFlag_*). The first group lists bit
  // positions, implicitly numbered from 0; the second group defines the
  // single-bit mask for each position.
  // NOTE(review): presumably stored in SystemConstants::edram_rt_flags -
  // confirm against the code that fills that field.
  enum : uint32_t {
    // Whether the render target needs to be merged with another (if the write
    // mask is not 1111, or 11 for 16_16, or 1 for 32_FLOAT, or blending is
    // enabled and it's not no-op).
    // NOTE(review): the comment above does not obviously describe the four
    // per-component Write bits that follow - it may be stale; verify.
    kRTFlag_WriteR_Shift,
    kRTFlag_WriteG_Shift,
    kRTFlag_WriteB_Shift,
    kRTFlag_WriteA_Shift,
    kRTFlag_Blend_Shift,
    // Whether the component does not exist in the render target format.
    kRTFlag_FormatUnusedR_Shift,
    kRTFlag_FormatUnusedG_Shift,
    kRTFlag_FormatUnusedB_Shift,
    kRTFlag_FormatUnusedA_Shift,
    // Whether the format is fixed-point and needs to be converted to integer
    // (k_8_8_8_8, k_2_10_10_10, k_16_16, k_16_16_16_16).
    kRTFlag_FormatFixed_Shift,
    // Whether the format is k_2_10_10_10_FLOAT and 7e3 conversion is needed.
    kRTFlag_FormatFloat10_Shift,
    // Whether the format is k_16_16_FLOAT or k_16_16_16_16_FLOAT and
    // f16tof32/f32tof16 is needed.
    kRTFlag_FormatFloat16_Shift,

    // Masks: one bit per flag, at the position given by the matching _Shift.
    kRTFlag_WriteR = 1u << kRTFlag_WriteR_Shift,
    kRTFlag_WriteG = 1u << kRTFlag_WriteG_Shift,
    kRTFlag_WriteB = 1u << kRTFlag_WriteB_Shift,
    kRTFlag_WriteA = 1u << kRTFlag_WriteA_Shift,
    kRTFlag_Blend = 1u << kRTFlag_Blend_Shift,
    kRTFlag_FormatUnusedR = 1u << kRTFlag_FormatUnusedR_Shift,
    kRTFlag_FormatUnusedG = 1u << kRTFlag_FormatUnusedG_Shift,
    kRTFlag_FormatUnusedB = 1u << kRTFlag_FormatUnusedB_Shift,
    kRTFlag_FormatUnusedA = 1u << kRTFlag_FormatUnusedA_Shift,
    kRTFlag_FormatFixed = 1u << kRTFlag_FormatFixed_Shift,
    kRTFlag_FormatFloat10 = 1u << kRTFlag_FormatFloat10_Shift,
    kRTFlag_FormatFloat16 = 1u << kRTFlag_FormatFloat16_Shift,
  };

  // Bit layout of the two packed blend words of a render target (kBlendX_* for
  // X/Z of the blend constant, kBlendY_* for Y/W).
  // Encoding conventions visible in the values below:
  // - Factors that only need an on/off switch (One, SrcAlphaSaturate, OpMin,
  //   OpMax) take one bit.
  // - Factors that may be used with either sign take two bits: _Pos is 1 and
  //   _Neg is 3 at the field's _Shift, so a _Neg value also contains the _Pos
  //   bit.
  // - The name prefix after kBlendX_/kBlendY_ (Src, SrcAlpha, Dest, DestAlpha)
  //   selects the group of fields; each group has its own bit range.
  enum : uint32_t {
    // X/Z of the blend constant for the render target.

    // For ONE_MINUS modes, enable both One and the needed factor with _Neg.
    kBlendX_Src_One_Shift = 0,
    kBlendX_Src_One = 1u << kBlendX_Src_One_Shift,
    kBlendX_Src_SrcColor_Shift = 1,
    kBlendX_Src_SrcColor_Pos = 1u << kBlendX_Src_SrcColor_Shift,
    kBlendX_Src_SrcColor_Neg = 3u << kBlendX_Src_SrcColor_Shift,
    kBlendX_Src_SrcAlpha_Shift = 3,
    kBlendX_Src_SrcAlpha_Pos = 1u << kBlendX_Src_SrcAlpha_Shift,
    kBlendX_Src_SrcAlpha_Neg = 3u << kBlendX_Src_SrcAlpha_Shift,
    kBlendX_Src_DestColor_Shift = 5,
    kBlendX_Src_DestColor_Pos = 1u << kBlendX_Src_DestColor_Shift,
    kBlendX_Src_DestColor_Neg = 3u << kBlendX_Src_DestColor_Shift,
    kBlendX_Src_DestAlpha_Shift = 7,
    kBlendX_Src_DestAlpha_Pos = 1u << kBlendX_Src_DestAlpha_Shift,
    kBlendX_Src_DestAlpha_Neg = 3u << kBlendX_Src_DestAlpha_Shift,
    kBlendX_Src_SrcAlphaSaturate_Shift = 9,
    kBlendX_Src_SrcAlphaSaturate = 1u << kBlendX_Src_SrcAlphaSaturate_Shift,

    kBlendX_SrcAlpha_One_Shift = 10,
    kBlendX_SrcAlpha_One = 1u << kBlendX_SrcAlpha_One_Shift,
    kBlendX_SrcAlpha_SrcAlpha_Shift = 11,
    kBlendX_SrcAlpha_SrcAlpha_Pos = 1u << kBlendX_SrcAlpha_SrcAlpha_Shift,
    kBlendX_SrcAlpha_SrcAlpha_Neg = 3u << kBlendX_SrcAlpha_SrcAlpha_Shift,
    kBlendX_SrcAlpha_DestAlpha_Shift = 13,
    kBlendX_SrcAlpha_DestAlpha_Pos = 1u << kBlendX_SrcAlpha_DestAlpha_Shift,
    kBlendX_SrcAlpha_DestAlpha_Neg = 3u << kBlendX_SrcAlpha_DestAlpha_Shift,

    // For ONE_MINUS modes, enable both One and the needed factor with _Neg.
    kBlendX_Dest_One_Shift = 15,
    kBlendX_Dest_One = 1u << kBlendX_Dest_One_Shift,
    kBlendX_Dest_SrcColor_Shift = 16,
    kBlendX_Dest_SrcColor_Pos = 1u << kBlendX_Dest_SrcColor_Shift,
    kBlendX_Dest_SrcColor_Neg = 3u << kBlendX_Dest_SrcColor_Shift,
    kBlendX_Dest_SrcAlpha_Shift = 18,
    kBlendX_Dest_SrcAlpha_Pos = 1u << kBlendX_Dest_SrcAlpha_Shift,
    kBlendX_Dest_SrcAlpha_Neg = 3u << kBlendX_Dest_SrcAlpha_Shift,
    kBlendX_Dest_DestColor_Shift = 20,
    kBlendX_Dest_DestColor_Pos = 1u << kBlendX_Dest_DestColor_Shift,
    kBlendX_Dest_DestColor_Neg = 3u << kBlendX_Dest_DestColor_Shift,
    kBlendX_Dest_DestAlpha_Shift = 22,
    kBlendX_Dest_DestAlpha_Pos = 1u << kBlendX_Dest_DestAlpha_Shift,
    kBlendX_Dest_DestAlpha_Neg = 3u << kBlendX_Dest_DestAlpha_Shift,
    kBlendX_Dest_SrcAlphaSaturate_Shift = 24,
    kBlendX_Dest_SrcAlphaSaturate = 1u << kBlendX_Dest_SrcAlphaSaturate_Shift,

    kBlendX_DestAlpha_One_Shift = 25,
    kBlendX_DestAlpha_One = 1u << kBlendX_DestAlpha_One_Shift,
    kBlendX_DestAlpha_SrcAlpha_Shift = 26,
    kBlendX_DestAlpha_SrcAlpha_Pos = 1u << kBlendX_DestAlpha_SrcAlpha_Shift,
    kBlendX_DestAlpha_SrcAlpha_Neg = 3u << kBlendX_DestAlpha_SrcAlpha_Shift,
    kBlendX_DestAlpha_DestAlpha_Shift = 28,
    kBlendX_DestAlpha_DestAlpha_Pos = 1u << kBlendX_DestAlpha_DestAlpha_Shift,
    kBlendX_DestAlpha_DestAlpha_Neg = 3u << kBlendX_DestAlpha_DestAlpha_Shift,

    // Y/W of the blend constant for the render target.
    // The bit positions restart from 0 here because this is a separate word.

    kBlendY_Src_ConstantColor_Shift = 0,
    kBlendY_Src_ConstantColor_Pos = 1u << kBlendY_Src_ConstantColor_Shift,
    kBlendY_Src_ConstantColor_Neg = 3u << kBlendY_Src_ConstantColor_Shift,
    kBlendY_Src_ConstantAlpha_Shift = 2,
    kBlendY_Src_ConstantAlpha_Pos = 1u << kBlendY_Src_ConstantAlpha_Shift,
    kBlendY_Src_ConstantAlpha_Neg = 3u << kBlendY_Src_ConstantAlpha_Shift,

    kBlendY_SrcAlpha_ConstantAlpha_Shift = 4,
    kBlendY_SrcAlpha_ConstantAlpha_Pos =
        1u << kBlendY_SrcAlpha_ConstantAlpha_Shift,
    kBlendY_SrcAlpha_ConstantAlpha_Neg =
        3u << kBlendY_SrcAlpha_ConstantAlpha_Shift,

    kBlendY_Dest_ConstantColor_Shift = 6,
    kBlendY_Dest_ConstantColor_Pos = 1u << kBlendY_Dest_ConstantColor_Shift,
    kBlendY_Dest_ConstantColor_Neg = 3u << kBlendY_Dest_ConstantColor_Shift,
    kBlendY_Dest_ConstantAlpha_Shift = 8,
    kBlendY_Dest_ConstantAlpha_Pos = 1u << kBlendY_Dest_ConstantAlpha_Shift,
    kBlendY_Dest_ConstantAlpha_Neg = 3u << kBlendY_Dest_ConstantAlpha_Shift,

    kBlendY_DestAlpha_ConstantAlpha_Shift = 10,
    kBlendY_DestAlpha_ConstantAlpha_Pos =
        1u << kBlendY_DestAlpha_ConstantAlpha_Shift,
    kBlendY_DestAlpha_ConstantAlpha_Neg =
        3u << kBlendY_DestAlpha_ConstantAlpha_Shift,

    // For addition/subtraction/inverse subtraction, but must be positive for
    // min/max.
    kBlendY_Src_OpSign_Shift = 12,
    kBlendY_Src_OpSign_Pos = 1u << kBlendY_Src_OpSign_Shift,
    kBlendY_Src_OpSign_Neg = 3u << kBlendY_Src_OpSign_Shift,
    kBlendY_SrcAlpha_OpSign_Shift = 14,
    kBlendY_SrcAlpha_OpSign_Pos = 1u << kBlendY_SrcAlpha_OpSign_Shift,
    kBlendY_SrcAlpha_OpSign_Neg = 3u << kBlendY_SrcAlpha_OpSign_Shift,
    kBlendY_Dest_OpSign_Shift = 16,
    kBlendY_Dest_OpSign_Pos = 1u << kBlendY_Dest_OpSign_Shift,
    kBlendY_Dest_OpSign_Neg = 3u << kBlendY_Dest_OpSign_Shift,
    kBlendY_DestAlpha_OpSign_Shift = 18,
    kBlendY_DestAlpha_OpSign_Pos = 1u << kBlendY_DestAlpha_OpSign_Shift,
    kBlendY_DestAlpha_OpSign_Neg = 3u << kBlendY_DestAlpha_OpSign_Shift,

    // Single-bit switches selecting min/max, separately for color and alpha.
    kBlendY_Color_OpMin_Shift = 20,
    kBlendY_Color_OpMin = 1u << kBlendY_Color_OpMin_Shift,
    kBlendY_Color_OpMax_Shift = 21,
    kBlendY_Color_OpMax = 1u << kBlendY_Color_OpMax_Shift,
    kBlendY_Alpha_OpMin_Shift = 22,
    kBlendY_Alpha_OpMin = 1u << kBlendY_Alpha_OpMin_Shift,
    kBlendY_Alpha_OpMax_Shift = 23,
    kBlendY_Alpha_OpMax = 1u << kBlendY_Alpha_OpMax_Shift,
  };

  // IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED:
  // - kSysConst enum (indices, registers and first components).
  // - system_constant_rdef_.
  // - d3d12/shaders/xenos_draw.hlsli (for geometry shaders).
  struct SystemConstants {
    // vec4 0
    uint32_t flags;
    uint32_t vertex_index_endian;
    uint32_t vertex_base_index;
    uint32_t pixel_pos_reg;

    // vec4 1
    float ndc_scale[3];
    float pixel_half_pixel_offset;

    // vec4 2
    float ndc_offset[3];
    // 0 - disabled, 1 - passes if in range, -1 - fails if in range.
    int32_t alpha_test;

    // vec4 3
    float point_size[2];
    float point_size_min_max[2];

    // vec4 4
    // Inverse scale of the host viewport (but not supersampled), with signs
    // pre-applied.
    float point_screen_to_ndc[2];
    float ssaa_inv_scale[2];

    // vec4 5
    // The range is floats as uints so it's easier to pass infinity.
    uint32_t alpha_test_range[2];
    uint32_t edram_pitch_tiles;
    uint32_t edram_depth_base_dwords;

    // vec4 6
    float color_exp_bias[4];

    // vec4 7
    uint32_t color_output_map[4];

    // vec4 8
    uint32_t edram_stencil_reference;
    uint32_t edram_stencil_read_mask;
    uint32_t edram_stencil_write_mask;
    uint32_t padding_8;

    // vec4 9
    union {
      struct {
        // kStencilOp, separated into sub-operations - not the Xenos enum.
        uint32_t edram_stencil_front_fail;
        uint32_t edram_stencil_front_depth_fail;
        uint32_t edram_stencil_front_pass;
        uint32_t edram_stencil_front_comparison;
      };
      uint32_t edram_stencil_front[4];
    };

    // vec4 10
    union {
      struct {
        // kStencilOp, separated into sub-operations - not the Xenos enum.
        uint32_t edram_stencil_back_fail;
        uint32_t edram_stencil_back_depth_fail;
        uint32_t edram_stencil_back_pass;
        uint32_t edram_stencil_back_comparison;
      };
      uint32_t edram_stencil_back[4];
    };

    // vec4 11
    uint32_t edram_base_dwords[4];

    // vec4 12
    // Binding and format info flags.
    uint32_t edram_rt_flags[4];

    // vec4 13
    // Format info - widths of components in the lower 32 bits (for ibfe/bfi),
    // packed as 8:8:8:8 for each render target.
    uint32_t edram_rt_pack_width_low[4];

    // vec4 14
    // Format info - offsets of components in the lower 32 bits (for ibfe/bfi),
    // packed as 8:8:8:8 for each render target.
    uint32_t edram_rt_pack_offset_low[4];

    // vec4 15
    // Format info - widths of components in the upper 32 bits (for ibfe/bfi),
    // packed as 8:8:8:8 for each render target.
    uint32_t edram_rt_pack_width_high[4];

    // vec4 16
    // Format info - offsets of components in the upper 32 bits (for ibfe/bfi),
    // packed as 8:8:8:8 for each render target.
    uint32_t edram_rt_pack_offset_high[4];

    // vec4 17:18
    // Format info - mask of color and alpha after unpacking, but before float
    // conversion. Primarily to differentiate between signed and unsigned
    // formats because ibfe is used for both since k_16_16 and k_16_16_16_16 are
    // signed.
    uint32_t edram_load_mask_rt01_rt23[2][4];

    // vec4 19:20
    // Format info - scale to apply to the color and the alpha of each render
    // target after unpacking and converting.
    float edram_load_scale_rt01_rt23[2][4];

    // vec4 21:22
    // Render target blending options.
    uint32_t edram_blend_rt01_rt23[2][4];

    // vec4 23
    // The constant blend factor for the respective modes.
    float edram_blend_constant[4];

    // vec4 24:25
    // Format info - minimum color and alpha values (as float, before
    // conversion) writable to the each render target. Integer so it's easier to
    // write infinity.
    uint32_t edram_store_min_rt01_rt23[2][4];

    // vec4 26:27
    // Format info - maximum color and alpha values (as float, before
    // conversion) writable to the each render target. Integer so it's easier to
    // write infinity.
    uint32_t edram_store_max_rt01_rt23[2][4];

    // vec4 28:29
    // Format info - scale to apply to the color and the alpha of each render
    // target before converting and packing.
    float edram_store_scale_rt01_rt23[2][4];
  };

  // 192 textures at most because there are 32 fetch constants, and textures can
  // be 2D array, 3D or cube, and also signed and unsigned.
  static constexpr uint32_t kMaxTextureSRVIndexBits = 8;
  static constexpr uint32_t kMaxTextureSRVs =
      (1 << kMaxTextureSRVIndexBits) - 1;
  struct TextureSRV {
    uint32_t fetch_constant;
    TextureDimension dimension;
    bool is_signed;
    // Whether this SRV must be bound even if it's signed and all components are
    // unsigned and vice versa (for kGetTextureComputedLod).
    bool is_sign_required;
    std::string name;
  };
  // Stores the number of texture SRVs in count_out and returns
  // texture_srvs_.data(). The first binding returned is at t1 because t0 is
  // shared memory.
  const TextureSRV* GetTextureSRVs(uint32_t& count_out) const {
    const auto srv_count = static_cast<uint32_t>(texture_srvs_.size());
    count_out = srv_count;
    return texture_srvs_.data();
  }

  // Arbitrary limit - there can't be more than 2048 in a shader-visible
  // descriptor heap, though some older hardware (tier 1 resource binding -
  // Nvidia Fermi) doesn't support more than 16 samplers bound at once (we can't
  // really do anything if a game uses more than 16), but just to have some
  // limit so sampler count can easily be packed into 32-bit map keys (for
  // instance, for root signatures). But shaders can specify overrides for
  // filtering modes, and the number of possible combinations is huge - let's
  // limit it to something sane.
  static constexpr uint32_t kMaxSamplerBindingIndexBits = 7;
  static constexpr uint32_t kMaxSamplerBindings =
      (1 << kMaxSamplerBindingIndexBits) - 1;
  struct SamplerBinding {
    uint32_t fetch_constant;
    TextureFilter mag_filter;
    TextureFilter min_filter;
    TextureFilter mip_filter;
    AnisoFilter aniso_filter;
    std::string name;
  };
  // Stores the number of sampler bindings in count_out and returns
  // sampler_bindings_.data().
  const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const {
    const auto binding_count = static_cast<uint32_t>(sampler_bindings_.size());
    count_out = binding_count;
    return sampler_bindings_.data();
  }

  // Returns the bits that need to be added to the RT flags constant - needs to
  // be done externally, not in SetColorFormatSystemConstants, because the flags
  // contain other state.
  static uint32_t GetColorFormatRTFlags(ColorRenderTargetFormat format);
  static void SetColorFormatSystemConstants(SystemConstants& constants,
                                            uint32_t rt_index,
                                            ColorRenderTargetFormat format);
  // Returns whether blending should be done at all (not 1 * src + 0 * dest).
  static bool GetBlendConstants(uint32_t blend_control, uint32_t& blend_x_out,
                                uint32_t& blend_y_out);

  // Creates a special pixel shader without color outputs - this resets the
  // state of the translator.
  std::vector<uint8_t> CreateDepthOnlyPixelShader();

 protected:
  void Reset() override;

  void StartTranslation() override;

  std::vector<uint8_t> CompleteTranslation() override;

  void ProcessLabel(uint32_t cf_index) override;

  void ProcessExecInstructionBegin(const ParsedExecInstruction& instr) override;
  void ProcessExecInstructionEnd(const ParsedExecInstruction& instr) override;
  void ProcessLoopStartInstruction(
      const ParsedLoopStartInstruction& instr) override;
  void ProcessLoopEndInstruction(
      const ParsedLoopEndInstruction& instr) override;
  void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override;

  void ProcessVertexFetchInstruction(
      const ParsedVertexFetchInstruction& instr) override;
  void ProcessTextureFetchInstruction(
      const ParsedTextureFetchInstruction& instr) override;
  void ProcessAluInstruction(const ParsedAluInstruction& instr) override;

 private:
  // Location of every system constant, mirroring the layout of
  // SystemConstants. For each constant:
  // - _Index is its ordinal among the system constants, chained from the
  //   previous constant's _Index + 1 (kSysConst_Count is the total).
  // - _Vec is the index of the vec4 register that holds it (the first one for
  //   constants spanning several registers), matching the "vec4 N" comments
  //   in SystemConstants.
  // - _Comp is the first component (0-3) it occupies within that register;
  //   constants that fill a whole register have no _Comp.
  // Must be kept in sync with SystemConstants whenever that struct changes.
  enum : uint32_t {
    kSysConst_Flags_Index = 0,
    kSysConst_Flags_Vec = 0,
    kSysConst_Flags_Comp = 0,
    kSysConst_VertexIndexEndian_Index = kSysConst_Flags_Index + 1,
    kSysConst_VertexIndexEndian_Vec = kSysConst_Flags_Vec,
    kSysConst_VertexIndexEndian_Comp = 1,
    kSysConst_VertexBaseIndex_Index = kSysConst_VertexIndexEndian_Index + 1,
    kSysConst_VertexBaseIndex_Vec = kSysConst_Flags_Vec,
    kSysConst_VertexBaseIndex_Comp = 2,
    kSysConst_PixelPosReg_Index = kSysConst_VertexBaseIndex_Index + 1,
    kSysConst_PixelPosReg_Vec = kSysConst_Flags_Vec,
    kSysConst_PixelPosReg_Comp = 3,

    kSysConst_NDCScale_Index = kSysConst_PixelPosReg_Index + 1,
    kSysConst_NDCScale_Vec = kSysConst_Flags_Vec + 1,
    kSysConst_NDCScale_Comp = 0,
    kSysConst_PixelHalfPixelOffset_Index = kSysConst_NDCScale_Index + 1,
    kSysConst_PixelHalfPixelOffset_Vec = kSysConst_NDCScale_Vec,
    kSysConst_PixelHalfPixelOffset_Comp = 3,

    kSysConst_NDCOffset_Index = kSysConst_PixelHalfPixelOffset_Index + 1,
    kSysConst_NDCOffset_Vec = kSysConst_NDCScale_Vec + 1,
    kSysConst_NDCOffset_Comp = 0,
    kSysConst_AlphaTest_Index = kSysConst_NDCOffset_Index + 1,
    kSysConst_AlphaTest_Vec = kSysConst_NDCOffset_Vec,
    kSysConst_AlphaTest_Comp = 3,

    kSysConst_PointSize_Index = kSysConst_AlphaTest_Index + 1,
    kSysConst_PointSize_Vec = kSysConst_NDCOffset_Vec + 1,
    kSysConst_PointSize_Comp = 0,
    kSysConst_PointSizeMinMax_Index = kSysConst_PointSize_Index + 1,
    kSysConst_PointSizeMinMax_Vec = kSysConst_PointSize_Vec,
    kSysConst_PointSizeMinMax_Comp = 2,

    kSysConst_PointScreenToNDC_Index = kSysConst_PointSizeMinMax_Index + 1,
    kSysConst_PointScreenToNDC_Vec = kSysConst_PointSizeMinMax_Vec + 1,
    kSysConst_PointScreenToNDC_Comp = 0,
    kSysConst_SSAAInvScale_Index = kSysConst_PointScreenToNDC_Index + 1,
    kSysConst_SSAAInvScale_Vec = kSysConst_PointScreenToNDC_Vec,
    kSysConst_SSAAInvScale_Comp = 2,

    kSysConst_AlphaTestRange_Index = kSysConst_SSAAInvScale_Index + 1,
    kSysConst_AlphaTestRange_Vec = kSysConst_SSAAInvScale_Vec + 1,
    kSysConst_AlphaTestRange_Comp = 0,
    kSysConst_EDRAMPitchTiles_Index = kSysConst_AlphaTestRange_Index + 1,
    kSysConst_EDRAMPitchTiles_Vec = kSysConst_AlphaTestRange_Vec,
    kSysConst_EDRAMPitchTiles_Comp = 2,
    kSysConst_EDRAMDepthBaseDwords_Index = kSysConst_EDRAMPitchTiles_Index + 1,
    kSysConst_EDRAMDepthBaseDwords_Vec = kSysConst_AlphaTestRange_Vec,
    kSysConst_EDRAMDepthBaseDwords_Comp = 3,

    kSysConst_ColorExpBias_Index = kSysConst_EDRAMDepthBaseDwords_Index + 1,
    kSysConst_ColorExpBias_Vec = kSysConst_EDRAMDepthBaseDwords_Vec + 1,

    kSysConst_ColorOutputMap_Index = kSysConst_ColorExpBias_Index + 1,
    kSysConst_ColorOutputMap_Vec = kSysConst_ColorExpBias_Vec + 1,

    kSysConst_EDRAMStencilReference_Index = kSysConst_ColorOutputMap_Index + 1,
    kSysConst_EDRAMStencilReference_Vec = kSysConst_ColorOutputMap_Vec + 1,
    kSysConst_EDRAMStencilReference_Comp = 0,
    kSysConst_EDRAMStencilReadMask_Index =
        kSysConst_EDRAMStencilReference_Index + 1,
    kSysConst_EDRAMStencilReadMask_Vec = kSysConst_EDRAMStencilReference_Vec,
    kSysConst_EDRAMStencilReadMask_Comp = 1,
    kSysConst_EDRAMStencilWriteMask_Index =
        kSysConst_EDRAMStencilReadMask_Index + 1,
    kSysConst_EDRAMStencilWriteMask_Vec = kSysConst_EDRAMStencilReference_Vec,
    kSysConst_EDRAMStencilWriteMask_Comp = 2,
    // The padding_8 member that completes this register in SystemConstants
    // has no entry here, so the _Index chain continues straight to the
    // stencil front constant.

    kSysConst_EDRAMStencilFront_Index =
        kSysConst_EDRAMStencilWriteMask_Index + 1,
    kSysConst_EDRAMStencilFront_Vec = kSysConst_EDRAMStencilWriteMask_Vec + 1,

    kSysConst_EDRAMStencilBack_Index = kSysConst_EDRAMStencilFront_Index + 1,
    kSysConst_EDRAMStencilBack_Vec = kSysConst_EDRAMStencilFront_Vec + 1,

    // Components of stencil front and back.
    kSysConst_EDRAMStencilSide_Fail_Comp = 0,
    kSysConst_EDRAMStencilSide_DepthFail_Comp = 1,
    kSysConst_EDRAMStencilSide_Pass_Comp = 2,
    kSysConst_EDRAMStencilSide_Comparison_Comp = 3,

    kSysConst_EDRAMBaseDwords_Index = kSysConst_EDRAMStencilBack_Index + 1,
    kSysConst_EDRAMBaseDwords_Vec = kSysConst_EDRAMStencilBack_Vec + 1,

    kSysConst_EDRAMRTFlags_Index = kSysConst_EDRAMBaseDwords_Index + 1,
    kSysConst_EDRAMRTFlags_Vec = kSysConst_EDRAMBaseDwords_Vec + 1,

    kSysConst_EDRAMRTPackWidthLow_Index = kSysConst_EDRAMRTFlags_Index + 1,
    kSysConst_EDRAMRTPackWidthLow_Vec = kSysConst_EDRAMRTFlags_Vec + 1,

    kSysConst_EDRAMRTPackOffsetLow_Index =
        kSysConst_EDRAMRTPackWidthLow_Index + 1,
    kSysConst_EDRAMRTPackOffsetLow_Vec = kSysConst_EDRAMRTPackWidthLow_Vec + 1,

    kSysConst_EDRAMRTPackWidthHigh_Index =
        kSysConst_EDRAMRTPackOffsetLow_Index + 1,
    kSysConst_EDRAMRTPackWidthHigh_Vec = kSysConst_EDRAMRTPackOffsetLow_Vec + 1,

    kSysConst_EDRAMRTPackOffsetHigh_Index =
        kSysConst_EDRAMRTPackWidthHigh_Index + 1,
    kSysConst_EDRAMRTPackOffsetHigh_Vec =
        kSysConst_EDRAMRTPackWidthHigh_Vec + 1,

    // From here on, each two-register member of SystemConstants (the
    // *_rt01_rt23 arrays) is exposed as two separate constants, RT01 and RT23,
    // one register each.
    kSysConst_EDRAMLoadMaskRT01_Index =
        kSysConst_EDRAMRTPackOffsetHigh_Index + 1,
    kSysConst_EDRAMLoadMaskRT01_Vec = kSysConst_EDRAMRTPackOffsetHigh_Vec + 1,

    kSysConst_EDRAMLoadMaskRT23_Index = kSysConst_EDRAMLoadMaskRT01_Index + 1,
    kSysConst_EDRAMLoadMaskRT23_Vec = kSysConst_EDRAMLoadMaskRT01_Vec + 1,

    kSysConst_EDRAMLoadScaleRT01_Index = kSysConst_EDRAMLoadMaskRT23_Index + 1,
    kSysConst_EDRAMLoadScaleRT01_Vec = kSysConst_EDRAMLoadMaskRT23_Vec + 1,

    kSysConst_EDRAMLoadScaleRT23_Index = kSysConst_EDRAMLoadScaleRT01_Index + 1,
    kSysConst_EDRAMLoadScaleRT23_Vec = kSysConst_EDRAMLoadScaleRT01_Vec + 1,

    kSysConst_EDRAMBlendRT01_Index = kSysConst_EDRAMLoadScaleRT23_Index + 1,
    kSysConst_EDRAMBlendRT01_Vec = kSysConst_EDRAMLoadScaleRT23_Vec + 1,

    kSysConst_EDRAMBlendRT23_Index = kSysConst_EDRAMBlendRT01_Index + 1,
    kSysConst_EDRAMBlendRT23_Vec = kSysConst_EDRAMBlendRT01_Vec + 1,

    kSysConst_EDRAMBlendConstant_Index = kSysConst_EDRAMBlendRT23_Index + 1,
    kSysConst_EDRAMBlendConstant_Vec = kSysConst_EDRAMBlendRT23_Vec + 1,

    kSysConst_EDRAMStoreMinRT01_Index = kSysConst_EDRAMBlendConstant_Index + 1,
    kSysConst_EDRAMStoreMinRT01_Vec = kSysConst_EDRAMBlendConstant_Vec + 1,

    kSysConst_EDRAMStoreMinRT23_Index = kSysConst_EDRAMStoreMinRT01_Index + 1,
    kSysConst_EDRAMStoreMinRT23_Vec = kSysConst_EDRAMStoreMinRT01_Vec + 1,

    kSysConst_EDRAMStoreMaxRT01_Index = kSysConst_EDRAMStoreMinRT23_Index + 1,
    kSysConst_EDRAMStoreMaxRT01_Vec = kSysConst_EDRAMStoreMinRT23_Vec + 1,

    kSysConst_EDRAMStoreMaxRT23_Index = kSysConst_EDRAMStoreMaxRT01_Index + 1,
    kSysConst_EDRAMStoreMaxRT23_Vec = kSysConst_EDRAMStoreMaxRT01_Vec + 1,

    kSysConst_EDRAMStoreScaleRT01_Index = kSysConst_EDRAMStoreMaxRT23_Index + 1,
    kSysConst_EDRAMStoreScaleRT01_Vec = kSysConst_EDRAMStoreMaxRT23_Vec + 1,

    kSysConst_EDRAMStoreScaleRT23_Index =
        kSysConst_EDRAMStoreScaleRT01_Index + 1,
    kSysConst_EDRAMStoreScaleRT23_Vec = kSysConst_EDRAMStoreScaleRT01_Vec + 1,

    // Total number of system constants (one past the last _Index).
    kSysConst_Count = kSysConst_EDRAMStoreScaleRT23_Index + 1
  };

  static constexpr uint32_t kInterpolatorCount = 16;
  static constexpr uint32_t kPointParametersTexCoord = kInterpolatorCount;

  // Input/output register numbers. The enum holds three independent
  // numbering sequences - vertex shader inputs, vertex shader outputs and
  // pixel shader inputs - each restarting at 0, so enumerators from different
  // sequences share the same numeric values.
  enum class InOutRegister : uint32_t {
    // IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature
    // MUST BE UPDATED!
    kVSInVertexIndex = 0,

    // kInterpolatorCount interpolator registers first, then point parameters,
    // then position.
    kVSOutInterpolators = 0,
    kVSOutPointParameters = kVSOutInterpolators + kInterpolatorCount,
    kVSOutPosition,

    // Same order as the vertex shader outputs, with the front face register
    // appended at the end.
    kPSInInterpolators = 0,
    kPSInPointParameters = kPSInInterpolators + kInterpolatorCount,
    kPSInPosition,
    kPSInFrontFace,
  };

  static constexpr uint32_t kSwizzleXYZW = 0b11100100;
  static constexpr uint32_t kSwizzleXXXX = 0b00000000;
  static constexpr uint32_t kSwizzleYYYY = 0b01010101;
  static constexpr uint32_t kSwizzleZZZZ = 0b10101010;
  static constexpr uint32_t kSwizzleWWWW = 0b11111111;

  // Operand encoding, with 32-bit immediate indices by default. None of the
  // arguments must be shifted when calling.
  // Encodes the operand token of a 1-component (scalar) operand.
  // - type goes to bits 12 and up, index_dimension to bits 20 and up.
  // - index_representation_0/1/2 describe the first, second and third index
  //   and go to bits 22, 25 and 28 respectively, the same positions the
  //   vector operand encoders below use.
  static constexpr uint32_t EncodeScalarOperand(
      uint32_t type, uint32_t index_dimension,
      uint32_t index_representation_0 = 0, uint32_t index_representation_1 = 0,
      uint32_t index_representation_2 = 0) {
    // D3D10_SB_OPERAND_1_COMPONENT.
    // The bits at 28 belong to the third index, so they must come from
    // index_representation_2 (not index_representation_0).
    return 1 | (type << 12) | (index_dimension << 20) |
           (index_representation_0 << 22) | (index_representation_1 << 25) |
           (index_representation_2 << 28);
  }
  // Operand token for writing to a vector through a component write mask.
  // The mask literal can be written as 0bWZYX.
  static constexpr uint32_t EncodeVectorMaskedOperand(
      uint32_t type, uint32_t mask, uint32_t index_dimension,
      uint32_t index_representation_0 = 0, uint32_t index_representation_1 = 0,
      uint32_t index_representation_2 = 0) {
    // Assembled from the highest field down to the lowest. The low bits are
    // D3D10_SB_OPERAND_4_COMPONENT (2) with
    // D3D10_SB_OPERAND_4_COMPONENT_MASK_MODE (0) in the selection mode field.
    return (index_representation_2 << 28) | (index_representation_1 << 25) |
           (index_representation_0 << 22) | (index_dimension << 20) |
           (type << 12) | (mask << 4) | (0u << 2) | 2u;
  }
  // Operand token for reading a vector through a 4-component swizzle.
  // The swizzle literal can be written as 0bWWZZYYXX.
  static constexpr uint32_t EncodeVectorSwizzledOperand(
      uint32_t type, uint32_t swizzle, uint32_t index_dimension,
      uint32_t index_representation_0 = 0, uint32_t index_representation_1 = 0,
      uint32_t index_representation_2 = 0) {
    // Assembled from the highest field down to the lowest. The low bits are
    // D3D10_SB_OPERAND_4_COMPONENT (2) with
    // D3D10_SB_OPERAND_4_COMPONENT_SWIZZLE_MODE (1) in the selection mode
    // field.
    return (index_representation_2 << 28) | (index_representation_1 << 25) |
           (index_representation_0 << 22) | (index_dimension << 20) |
           (type << 12) | (swizzle << 4) | (1u << 2) | 2u;
  }
  // Operand token for reading one component of a vector broadcast to all four
  // components: a swizzle in which every slot selects the same component.
  static constexpr uint32_t EncodeVectorReplicatedOperand(
      uint32_t type, uint32_t component, uint32_t index_dimension,
      uint32_t index_representation_0 = 0, uint32_t index_representation_1 = 0,
      uint32_t index_representation_2 = 0) {
    // Assembled from the highest field down to the lowest; the four 2-bit
    // swizzle slots (bits 10, 8, 6 and 4) all receive the same component. The
    // low bits are D3D10_SB_OPERAND_4_COMPONENT (2) with
    // D3D10_SB_OPERAND_4_COMPONENT_SWIZZLE_MODE (1) in the selection mode
    // field.
    return (index_representation_2 << 28) | (index_representation_1 << 25) |
           (index_representation_0 << 22) | (index_dimension << 20) |
           (type << 12) | (component << 10) | (component << 8) |
           (component << 6) | (component << 4) | (1u << 2) | 2u;
  }
  // Operand token for reading a single component of a vector as a scalar.
  static constexpr uint32_t EncodeVectorSelectOperand(
      uint32_t type, uint32_t component, uint32_t index_dimension,
      uint32_t index_representation_0 = 0, uint32_t index_representation_1 = 0,
      uint32_t index_representation_2 = 0) {
    // Assembled from the highest field down to the lowest. The low bits are
    // D3D10_SB_OPERAND_4_COMPONENT (2) with
    // D3D10_SB_OPERAND_4_COMPONENT_SELECT_1_MODE (2) in the selection mode
    // field.
    return (index_representation_2 << 28) | (index_representation_1 << 25) |
           (index_representation_0 << 22) | (index_dimension << 20) |
           (type << 12) | (component << 4) | (2u << 2) | 2u;
  }

  // Use these instead of is_vertex_shader/is_pixel_shader, because those two
  // don't take is_depth_only_pixel_shader_ into account.
  inline bool IsDXBCVertexShader() const {
    // The depth-only pixel shader is never a vertex shader, whatever
    // is_vertex_shader() would report.
    if (is_depth_only_pixel_shader_) {
      return false;
    }
    return is_vertex_shader();
  }
  inline bool IsDXBCPixelShader() const {
    // The depth-only pixel shader always counts as a pixel shader, whatever
    // is_pixel_shader() would report.
    if (is_depth_only_pixel_shader_) {
      return true;
    }
    return is_pixel_shader();
  }

  // Allocates a new r# register for internal use and returns its index.
  uint32_t PushSystemTemp(bool zero = false);
  // Frees the last allocated internal r# registers for later reuse.
  void PopSystemTemp(uint32_t count = 1);

  // Whether general-purpose register values should be stored in x0 rather than
  // r# in this shader.
  bool IndexableGPRsUsed() const;

  // Writing the prologue.
  void StartVertexShader_LoadVertexIndex();
  void StartVertexShader();
  void StartPixelShader();

  // Writing the epilogue.
  void CompleteVertexShader();
  // Converts the depth in system_temp_depth_.x to 24-bit unorm or float,
  // depending on the flag value. Uses system_temp_depth_.yz as scratch - w not
  // touched.
  void CompletePixelShader_DepthTo24Bit();
  // This just converts the color output value from/to gamma space, not checking
  // any conditions.
  void CompletePixelShader_GammaCorrect(uint32_t color_temp, bool to_gamma);
  void CompletePixelShader_WriteToRTVs();
  // Extracts widths and offsets of the components in the lower or the upper
  // dword of a pixel from the format constants, for use as ibfe and bfi
  // operands later.
  void CompletePixelShader_WriteToROV_ExtractPackLayout(uint32_t rt_index,
                                                        bool high,
                                                        uint32_t width_temp,
                                                        uint32_t offset_temp);
  // Components of rt_format_flags_temp.
  // The first four enumerators are implicitly numbered 0-3 in declaration
  // order, one per component of the register.
  enum : uint32_t {
    kROVRTFormatFlagTemp_ColorFixed,
    kROVRTFormatFlagTemp_AlphaFixed,
    kROVRTFormatFlagTemp_Float10,
    kROVRTFormatFlagTemp_Float16,

    // Packs the component indices at 2 bits per slot: ColorFixed in the three
    // lower slots and AlphaFixed in the highest one (evaluates to 0b01000000
    // with the values above).
    // NOTE(review): presumably passed as an operand swizzle to read the
    // color/alpha "fixed" flags as a 4-component vector - confirm at use sites.
    kROVRTFormatFlagTemp_Fixed_Swizzle =
        kROVRTFormatFlagTemp_ColorFixed * 0b00010101 +
        kROVRTFormatFlagTemp_AlphaFixed * 0b01000000,
  };
  void CompletePixelShader_WriteToROV_LoadColor(
      uint32_t edram_dword_offset_low_temp,
      uint32_t edram_dword_offset_high_temp, uint32_t rt_index,
      uint32_t rt_format_flags_temp, uint32_t target_temp);
  // Clamps the color to the range representable by the render target's format.
  // Will also remove NaN since min and max return the non-NaN value.
  // color_in_temp and color_out_temp may be the same.
  void CompletePixelShader_WriteToROV_ClampColor(uint32_t rt_index,
                                                 uint32_t color_in_temp,
                                                 uint32_t color_out_temp);
  // Extracts 0.0 or plus/minus 1.0 from a blend constant. For example, it can
  // be used to extract one scale for color and alpha into XY, and another scale
  // for color and alpha into ZW. constant_swizzle is a bit mask indicating
  // which part of the blend constant for the render target to extract the scale
  // from, 0b00000000 for X/Z only, 0b01010101 for Y/W only, 0b00000001 for X/Z
  // in the first component, Y/W in the rest (XY changed to ZW automatically
  // according to the render target index - don't set the higher bit).
  void CompletePixelShader_WriteToROV_ExtractBlendScales(
      uint32_t rt_index, uint32_t constant_swizzle, bool is_signed,
      uint32_t shift_x, uint32_t shift_y, uint32_t shift_z, uint32_t shift_w,
      uint32_t target_temp, uint32_t write_mask = 0b1111);
  void CompletePixelShader_WriteToROV_ApplyZeroBlendScale(
      uint32_t scale_temp, uint32_t scale_swizzle, uint32_t factor_in_temp,
      uint32_t factor_swizzle, uint32_t factor_out_temp,
      uint32_t write_mask = 0b1111);
  void CompletePixelShader_WriteToROV_Blend(uint32_t rt_index,
                                            uint32_t rt_format_flags_temp,
                                            uint32_t src_color_and_output_temp,
                                            uint32_t dest_color_temp);
  // Assumes the incoming color is already clamped to the range representable by
  // the RT format.
  void CompletePixelShader_WriteToROV_StoreColor(
      uint32_t edram_dword_offset_low_temp,
      uint32_t edram_dword_offset_high_temp, uint32_t rt_index,
      uint32_t rt_format_flags_temp, uint32_t source_and_scratch_temp);
  void CompletePixelShader_WriteToROV();
  void CompletePixelShader();
  void CompleteShaderCode();

  // Writes the original instruction disassembly in the output DXBC if enabled,
  // as shader messages, from instruction_disassembly_buffer_.
  void EmitInstructionDisassembly();

  // Abstract 4-component vector source operand.
  // Filled by LoadDxbcSourceOperand, written into instructions with
  // UseDxbcSourceOperand, and released with UnloadDxbcSourceOperand.
  struct DxbcSourceOperand {
    // Where the value of the operand comes from - determines how `index` is
    // interpreted.
    enum class Type {
      // GPR number in the index - used only when GPRs are not dynamically
      // indexed in the shader and there are no constant zeros and ones in the
      // swizzle.
      kRegister,
      // Immediate: float constant vector number in the index.
      // Dynamic: intermediate X contains page number, intermediate Y contains
      // vector number in the page.
      kConstantFloat,
      // The whole value preloaded to the intermediate register - used for GPRs
      // when they are indexable, for bool/loop constants pre-converted to
      // float, and for other operands if their swizzle contains 0 or 1.
      kIntermediateRegister,
      // Literal vector of zeros and positive or negative ones - when the
      // swizzle contains only them, or when the parsed operand is invalid (for
      // example, if it's a fetch constant in a non-tfetch texture instruction).
      // 0 or 1 specified in the index as bits, can be negated.
      kZerosOnes,
    };

    Type type;
    // Meaning depends on `type` - see the comments on the Type enumerators.
    uint32_t index;
    // If the operand is dynamically indexed directly when it's used as an
    // operand in DXBC instructions.
    InstructionStorageAddressingMode addressing_mode;

    // Swizzle and modifiers of the operand.
    // NOTE(review): presumably taken from the parsed InstructionOperand -
    // confirm in LoadDxbcSourceOperand.
    uint32_t swizzle;
    bool is_negated;
    bool is_absolute_value;

    // Temporary register containing data required to access the value if it has
    // to be accessed in multiple operations (allocated with PushSystemTemp).
    uint32_t intermediate_register;
    // NOTE(review): presumably the value of intermediate_register when no
    // temporary register has been allocated for the operand - confirm in
    // LoadDxbcSourceOperand/UnloadDxbcSourceOperand.
    static constexpr uint32_t kIntermediateRegisterNone = UINT32_MAX;
  };
  // Each Load must be followed by Unload, otherwise there may be a temporary
  // register leak.
  void LoadDxbcSourceOperand(const InstructionOperand& operand,
                             DxbcSourceOperand& dxbc_operand);
  // Number of tokens this operand adds to the instruction length when used.
  uint32_t DxbcSourceOperandLength(const DxbcSourceOperand& operand,
                                   bool negate = false,
                                   bool absolute = false) const;
  // Writes the operand access tokens to the instruction (either for a scalar if
  // select_component is <= 3, or for a vector).
  void UseDxbcSourceOperand(const DxbcSourceOperand& operand,
                            uint32_t additional_swizzle = kSwizzleXYZW,
                            uint32_t select_component = 4, bool negate = false,
                            bool absolute = false);
  void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand);

  // Writes xyzw or xxxx of the specified r# to the destination.
  void StoreResult(const InstructionResult& result, uint32_t reg,
                   bool replicate_x);

  // The nesting of `if` instructions is the following:
  // - pc checks (labels).
  // - exec predicate/bool constant check.
  // - Instruction-level predicate checks.
  // As an optimization, where possible, the DXBC translator tries to merge
  // multiple execs into one, not creating endif/if doing nothing, if the
  // execution condition is the same. This can't be done across labels
  // (obviously) and in case `setp` is done in a predicated exec - in this case,
  // the predicate value in the current exec may not match the predicate value
  // in the next exec.
  // Instruction-level predicate checks are also merged, and until a `setp` is
  // done, if the instruction has the same predicate condition as the exec it is
  // in, no instruction-level predicate `if` is created as well. One exception
  // to the usual way of instruction-level predicate handling is made for
  // instructions involving derivative computation, such as texture fetches with
  // computed LOD. The part involving derivatives is executed disregarding the
  // predication, but the result storing is predicated (this is handled in
  // texture fetch instruction implementation):
  // https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color

  // Updates the current flow control condition (to be called in the beginning
  // of exec and in jumps), closing the previous conditionals if needed.
  // However, if the condition is not different, the instruction-level predicate
  // `if` also won't be closed - this must be checked separately if needed (for
  // example, in jumps). If emit_disassembly is true, this function emits the
  // last disassembly written to instruction_disassembly_buffer_ after closing
  // the previous conditional and before opening a new one.
  void UpdateExecConditionals(ParsedExecInstruction::Type type,
                              uint32_t bool_constant_index, bool condition,
                              bool emit_disassembly);
  // Closes `if`s opened by exec and instructions within them (but not by
  // labels) and updates the state accordingly.
  void CloseExecConditionals();
  // Opens or reopens the predicate check conditional for the instruction. If
  // emit_disassembly is true, this function emits the last disassembly written
  // to instruction_disassembly_buffer_ after closing the previous predicate
  // conditional and before opening a new one.
  void UpdateInstructionPredication(bool predicated, bool condition,
                                    bool emit_disassembly);
  // Closes the instruction-level predicate `if` if it's open, useful if a flow
  // control instruction needs to do some code which needs to respect the exec's
  // conditional, but can't itself be predicated.
  void CloseInstructionPredication();
  void JumpToLabel(uint32_t address);

  // Emits code for endian swapping of the data located in pv.
  void SwapVertexData(uint32_t vfetch_index, uint32_t write_mask);

  // Returns T#/t# index (they are the same in this translator).
  uint32_t FindOrAddTextureSRV(uint32_t fetch_constant,
                               TextureDimension dimension, bool is_signed,
                               bool is_sign_required = false);
  // Returns S#/s# index (they are the same in this translator).
  uint32_t FindOrAddSamplerBinding(uint32_t fetch_constant,
                                   TextureFilter mag_filter,
                                   TextureFilter min_filter,
                                   TextureFilter mip_filter,
                                   AnisoFilter aniso_filter);
  // Converts (S, T, face index) in the specified temporary register to a 3D
  // cubemap coordinate.
  void ArrayCoordToCubeDirection(uint32_t reg);

  void ProcessVectorAluInstruction(const ParsedAluInstruction& instr);
  void ProcessScalarAluInstruction(const ParsedAluInstruction& instr);

  // Appends a string to a DWORD stream, returns the DWORD-aligned length.
  static uint32_t AppendString(std::vector<uint32_t>& dest, const char* source);
  // Returns the length of a string as if it was appended to a DWORD stream, in
  // bytes.
  static inline uint32_t GetStringLength(const char* source) {
    return uint32_t(xe::align(std::strlen(source) + 1, sizeof(uint32_t)));
  }

  void WriteResourceDefinitions();
  void WriteInputSignature();
  void WriteOutputSignature();
  void WriteShaderCode();

  // Executable instructions - generated during translation.
  std::vector<uint32_t> shader_code_;
  // Complete shader object, with all the needed chunks and dcl_ instructions -
  // generated in the end of translation.
  std::vector<uint32_t> shader_object_;

  // Buffer for instruction disassembly comments.
  StringBuffer instruction_disassembly_buffer_;

  // Whether the output merger should be emulated in pixel shaders.
  bool edram_rov_used_;

  // Is currently writing the empty depth-only pixel shader, for
  // CompleteTranslation.
  bool is_depth_only_pixel_shader_;

  // Data types used in constant buffers. Listed in dependency order.
  // The values are used as indices into rdef_types_.
  enum class RdefTypeIndex {
    kFloat,
    kFloat2,
    kFloat3,
    kFloat4,
    kInt,
    kUint,
    kUint4,
    // Float constants - size written dynamically.
    kFloat4ConstantArray,
    // Bool constants.
    kUint4Array8,
    // Loop constants.
    kUint4Array32,
    // Fetch constants.
    kUint4Array48,

    // Number of the types above - the size of rdef_types_.
    kCount,
    // Shares the value of kCount, so it never refers to a valid entry.
    // NOTE(review): presumably used where no type applies (such as
    // array_element_type of non-arrays) - confirm in the rdef_types_ table.
    kUnknown = kCount
  };

  // Description of a single member of a structure type, referenced from
  // RdefType::struct_members.
  struct RdefStructMember {
    const char* name;
    RdefTypeIndex type;
    // NOTE(review): presumably the offset of the member within the structure,
    // in bytes - confirm in WriteResourceDefinitions.
    uint32_t offset;
  };

  // Description of one data type used in constant buffers - rdef_types_ holds
  // one entry for each RdefTypeIndex below kCount.
  struct RdefType {
    // Name ignored for arrays.
    const char* name;
    // D3D10_SHADER_VARIABLE_CLASS.
    uint32_t type_class;
    // D3D10_SHADER_VARIABLE_TYPE.
    uint32_t type;
    uint32_t row_count;
    uint32_t column_count;
    // 0 for primitive types, 1 for structures, array size for arrays.
    uint32_t element_count;
    // Number of entries pointed to by struct_members.
    // NOTE(review): presumably 0 (with struct_members null) for
    // non-structures - confirm in the rdef_types_ table.
    uint32_t struct_member_count;
    // Type of each element if this type is an array.
    // NOTE(review): presumably RdefTypeIndex::kUnknown otherwise - confirm in
    // the rdef_types_ table.
    RdefTypeIndex array_element_type;
    const RdefStructMember* struct_members;
  };
  static const RdefType rdef_types_[size_t(RdefTypeIndex::kCount)];

  // Number of constant buffer bindings used in this shader - also used for
  // generation of indices of constant buffers that are optional.
  uint32_t cbuffer_count_;
  static constexpr uint32_t kCbufferIndexUnallocated = UINT32_MAX;
  uint32_t cbuffer_index_system_constants_;
  uint32_t cbuffer_index_float_constants_;
  uint32_t cbuffer_index_bool_loop_constants_;
  uint32_t cbuffer_index_fetch_constants_;

  // Description of one system constant for RDEF - system_constant_rdef_ holds
  // kSysConst_Count of these.
  struct SystemConstantRdef {
    const char* name;
    RdefTypeIndex type;
    // NOTE(review): offset and size are presumably in bytes within the system
    // constant buffer - confirm in WriteResourceDefinitions.
    uint32_t offset;
    uint32_t size;
  };
  static const SystemConstantRdef system_constant_rdef_[kSysConst_Count];
  // Mask of system constants (1 << kSysConst_#_Index) used in the shader, so
  // the remaining ones can be marked as unused in RDEF.
  uint64_t system_constants_used_;

  // Whether constants are dynamically indexed and need to be marked as such in
  // dcl_constantBuffer.
  bool float_constants_dynamic_indexed_;
  bool bool_loop_constants_dynamic_indexed_;

  // Offsets of float constant indices in shader_code_, for remapping in
  // CompleteTranslation (initially, at these offsets, guest float constant
  // indices are written).
  std::vector<uint32_t> float_constant_index_offsets_;

  // Number of currently allocated Xenia internal r# registers.
  uint32_t system_temp_count_current_;
  // Total maximum number of temporary registers ever used during this
  // translation (for the declaration).
  uint32_t system_temp_count_max_;

  // Vector ALU result/scratch (since Xenos write masks can contain swizzles).
  uint32_t system_temp_pv_;
  // Temporary register ID for previous scalar result, program counter,
  // predicate and absolute address register.
  uint32_t system_temp_ps_pc_p0_a0_;
  // Loop index stack - .x is the active loop, shifted right to .yzw on push.
  uint32_t system_temp_aL_;
  // Loop counter stack, .x is the active loop. Represents number of times
  // remaining to loop.
  uint32_t system_temp_loop_count_;
  // Explicitly set texture gradients and LOD.
  uint32_t system_temp_grad_h_lod_;
  uint32_t system_temp_grad_v_;

  // Position in vertex shaders (because viewport and W transformations can be
  // applied in the end of the shader).
  uint32_t system_temp_position_;

  // Color outputs in pixel shaders (because of exponent bias, alpha test and
  // remapping).
  uint32_t system_temp_color_[4];
  // Whether the color output has been written in the execution path (ROV only).
  uint32_t system_temp_color_written_;
  // Depth output in pixel shader, and 3 dwords usable as scratch for operations
  // related to depth. Currently only used for ROV depth.
  // TODO(Triang3l): Reduce depth to 24-bit in pixel shaders when using a DSV
  // for accuracy.
  uint32_t system_temp_depth_;

  // The bool constant number containing the condition for the currently
  // processed exec (or the last - unless a label has reset this), or
  // kCfExecBoolConstantNone if it's not checked.
  uint32_t cf_exec_bool_constant_;
  static constexpr uint32_t kCfExecBoolConstantNone = UINT32_MAX;
  // The expected bool constant value in the current exec if
  // cf_exec_bool_constant_ is not kCfExecBoolConstantNone.
  bool cf_exec_bool_constant_condition_;
  // Whether the currently processed exec is executed if a predicate is
  // set/unset.
  bool cf_exec_predicated_;
  // The expected predicated condition if cf_exec_predicated_ is true.
  bool cf_exec_predicate_condition_;
  // Whether an `if` for instruction-level predicate check is currently open.
  bool cf_instruction_predicate_if_open_;
  // The expected predicate condition for the current or the last instruction if
  // cf_instruction_predicate_if_open_ is true.
  bool cf_instruction_predicate_condition_;
  // Whether there was a `setp` in the current exec before the current
  // instruction, thus instruction-level predicate value can be different than
  // the exec-level predicate value, and can't merge two execs with the same
  // predicate condition anymore.
  bool cf_exec_predicate_written_;

  bool writes_depth_;

  std::vector<TextureSRV> texture_srvs_;
  std::vector<SamplerBinding> sampler_bindings_;

  // The STAT chunk (based on Wine d3dcompiler_parse_stat).
  // Consists of 37 dwords; fields whose meaning is not known are named
  // unknown_N, where N is the zero-based dword index of the field.
  // NOTE(review): the field order presumably must match the chunk layout
  // exactly, as the structure looks like it's written to the shader object
  // as is - confirm in the code writing the chunk before reordering anything.
  struct Statistics {
    uint32_t instruction_count;
    uint32_t temp_register_count;
    // Unknown in Wine.
    uint32_t def_count;
    // Only inputs and outputs.
    uint32_t dcl_count;
    uint32_t float_instruction_count;
    uint32_t int_instruction_count;
    uint32_t uint_instruction_count;
    // endif, ret.
    uint32_t static_flow_control_count;
    // if (but not else).
    uint32_t dynamic_flow_control_count;
    // Unknown in Wine.
    uint32_t macro_instruction_count;
    uint32_t temp_array_count;
    uint32_t array_instruction_count;
    uint32_t cut_instruction_count;
    uint32_t emit_instruction_count;
    uint32_t texture_normal_instructions;
    uint32_t texture_load_instructions;
    uint32_t texture_comp_instructions;
    uint32_t texture_bias_instructions;
    uint32_t texture_gradient_instructions;
    // Not including indexable temp load/store.
    uint32_t mov_instruction_count;
    // Unknown in Wine.
    uint32_t movc_instruction_count;
    uint32_t conversion_instruction_count;
    // Unknown in Wine.
    uint32_t unknown_22;
    uint32_t input_primitive;
    uint32_t gs_output_topology;
    uint32_t gs_max_output_vertex_count;
    uint32_t unknown_26;
    // Unknown in Wine, but confirmed by testing.
    uint32_t lod_instructions;
    uint32_t unknown_28;
    uint32_t unknown_29;
    uint32_t c_control_points;
    uint32_t hs_output_primitive;
    uint32_t hs_partitioning;
    uint32_t tessellator_domain;
    // Unknown in Wine.
    uint32_t c_barrier_instructions;
    // Unknown in Wine.
    uint32_t c_interlocked_instructions;
    // Unknown in Wine, but confirmed by testing.
    uint32_t c_texture_store_instructions;
  };
  Statistics stat_;
};

}  // namespace gpu
}  // namespace xe

#endif  // XENIA_GPU_DXBC_SHADER_TRANSLATOR_H_