/*
 * Geforce NV2A PGRAPH GLSL Shader Generator
 *
 * Copyright (c) 2014 Jannik Vogel
 * Copyright (c) 2012 espes
 *
 * Based on:
 * Cxbx, VertexShader.cpp
 * Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
 *                    Kingofc <kingofc@freenet.de>
 * Dxbx, uPushBuffer.pas
 * Copyright (c) 2007 Shadow_tj, PatrickvL
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 or
 * (at your option) version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <assert.h>

#include "hw/xbox/nv2a/pgraph/vsh.h"
#include "common.h"
#include "vsh-prog.h"

/* Bias added by convert_c_register() so that a signed constant register
 * index in the range -96..95 maps onto 0..191. */
#define VSH_D3DSCM_CORRECTION 96


/* Source kind of an instruction input, as read from its MUX field. */
typedef enum {
    PARAM_UNKNOWN = 0, /* not a valid source; rejected by the decoder */
    PARAM_R       = 1, /* temporary register, printed as "R<n>" */
    PARAM_V       = 2, /* vertex input register, printed as "v<n>" */
    PARAM_C       = 3  /* constant register, printed as "c[<n>]" */
} VshParameterType;

/* Value of the FLD_OUT_ORB bit: which register bank the output address
 * refers to. */
typedef enum {
    OUTPUT_C = 0, /* constant register (writes are not emulated yet) */
    OUTPUT_O = 1  /* named output register, see out_reg_name[] */
} VshOutputType;

/* Value of the FLD_OUT_MUX bit: which unit's result is routed to the
 * output register. */
typedef enum {
    OMUX_MAC = 0, /* result of the MAC opcode */
    OMUX_ILU = 1  /* result of the ILU opcode */
} VshOutputMux;

/* ILU opcodes, numbered as encoded in the 3-bit FLD_ILU field. The values
 * double as indices into ilu_opcode[] and ilu_force_scalar[]. */
typedef enum {
    ILU_NOP = 0,
    ILU_MOV = 1,
    ILU_RCP = 2,
    ILU_RCC = 3,
    ILU_RSQ = 4,
    ILU_EXP = 5,
    ILU_LOG = 6,
    ILU_LIT = 7
} VshILU;

/* MAC opcodes, numbered as encoded in the FLD_MAC field. The values double
 * as indices into mac_opcode_params[] and mac_opcode[]. */
typedef enum {
    MAC_NOP = 0,
    MAC_MOV = 1,
    MAC_MUL = 2,
    MAC_ADD = 3,
    MAC_MAD = 4,
    MAC_DP3 = 5,
    MAC_DPH = 6,
    MAC_DP4 = 7,
    MAC_DST = 8,
    MAC_MIN = 9,
    MAC_MAX = 10,
    MAC_SLT = 11,
    MAC_SGE = 12,
    MAC_ARL = 13
} VshMAC;

/* Component selector stored in each 2-bit swizzle field; also the index
 * of the component letter in "xyzw". */
typedef enum {
    SWIZZLE_X = 0,
    SWIZZLE_Y = 1,
    SWIZZLE_Z = 2,
    SWIZZLE_W = 3
} VshSwizzle;


/* Describes where one instruction field lives inside a shader token. */
struct VshFieldMapping {
    VshFieldName field_name; /* field this entry describes */
    uint8_t subtoken;        /* index of the 32-bit word holding the field */
    uint8_t start_bit;       /* bit position of the field's lowest bit */
    uint8_t bit_length;      /* width of the field in bits */
};
typedef struct VshFieldMapping VshFieldMapping;

/*
 * Bit layout of every instruction field within a shader token.
 *
 * vsh_get_field() looks entries up with field_mapping[field_name], so the
 * rows presumably have to stay in the declaration order of VshFieldName
 * (declared outside this file) -- TODO confirm against vsh.h before
 * reordering or inserting rows.
 */
static const VshFieldMapping field_mapping[] = {
    // Field Name         DWORD BitPos BitSize
    {  FLD_ILU,              1,   25,     3 },
    {  FLD_MAC,              1,   21,     4 },
    {  FLD_CONST,            1,   13,     8 },
    {  FLD_V,                1,    9,     4 },
    // INPUT A
    {  FLD_A_NEG,            1,    8,     1 },
    {  FLD_A_SWZ_X,          1,    6,     2 },
    {  FLD_A_SWZ_Y,          1,    4,     2 },
    {  FLD_A_SWZ_Z,          1,    2,     2 },
    {  FLD_A_SWZ_W,          1,    0,     2 },
    {  FLD_A_R,              2,   28,     4 },
    {  FLD_A_MUX,            2,   26,     2 },
    // INPUT B
    {  FLD_B_NEG,            2,   25,     1 },
    {  FLD_B_SWZ_X,          2,   23,     2 },
    {  FLD_B_SWZ_Y,          2,   21,     2 },
    {  FLD_B_SWZ_Z,          2,   19,     2 },
    {  FLD_B_SWZ_W,          2,   17,     2 },
    {  FLD_B_R,              2,   13,     4 },
    {  FLD_B_MUX,            2,   11,     2 },
    // INPUT C
    {  FLD_C_NEG,            2,   10,     1 },
    {  FLD_C_SWZ_X,          2,    8,     2 },
    {  FLD_C_SWZ_Y,          2,    6,     2 },
    {  FLD_C_SWZ_Z,          2,    4,     2 },
    {  FLD_C_SWZ_W,          2,    2,     2 },
    {  FLD_C_R_HIGH,         2,    0,     2 },
    {  FLD_C_R_LOW,          3,   30,     2 },
    {  FLD_C_MUX,            3,   28,     2 },
    // Output
    {  FLD_OUT_MAC_MASK,     3,   24,     4 },
    {  FLD_OUT_R,            3,   20,     4 },
    {  FLD_OUT_ILU_MASK,     3,   16,     4 },
    {  FLD_OUT_O_MASK,       3,   12,     4 },
    {  FLD_OUT_ORB,          3,   11,     1 },
    {  FLD_OUT_ADDRESS,      3,    3,     8 },
    {  FLD_OUT_MUX,          3,    2,     1 },
    // Other
    {  FLD_A0X,              3,    1,     1 },
    {  FLD_FINAL,            3,    0,     1 }
};


/* Flags telling which of the three inputs (A, B, C) an opcode consumes. */
struct VshOpcodeParams {
    bool A; /* opcode reads input A */
    bool B; /* opcode reads input B */
    bool C; /* opcode reads input C */
};
typedef struct VshOpcodeParams VshOpcodeParams;

/* Disabled reference table: every ILU opcode except NOP reads only input C,
 * which decode_token() already hard-codes, so the table is compiled out. */
#if 0
static const VshOpcodeParams ilu_opcode_params[] = {
    /* ILU OP       ParamA ParamB ParamC */
    /* ILU_NOP */ { false, false, false }, // Dxbx note : Unused
    /* ILU_MOV */ { false, false, true  },
    /* ILU_RCP */ { false, false, true  },
    /* ILU_RCC */ { false, false, true  },
    /* ILU_RSQ */ { false, false, true  },
    /* ILU_EXP */ { false, false, true  },
    /* ILU_LOG */ { false, false, true  },
    /* ILU_LIT */ { false, false, true  },
};
#endif

static const VshOpcodeParams mac_opcode_params[] = {
    /* MAC OP      ParamA  ParamB ParamC */
    /* MAC_NOP */ { false, false, false }, // Dxbx note : Unused
    /* MAC_MOV */ { true,  false, false },
    /* MAC_MUL */ { true,  true,  false },
    /* MAC_ADD */ { true,  false, true  },
    /* MAC_MAD */ { true,  true,  true  },
    /* MAC_DP3 */ { true,  true,  false },
    /* MAC_DPH */ { true,  true,  false },
    /* MAC_DP4 */ { true,  true,  false },
    /* MAC_DST */ { true,  true,  false },
    /* MAC_MIN */ { true,  true,  false },
    /* MAC_MAX */ { true,  true,  false },
    /* MAC_SLT */ { true,  true,  false },
    /* MAC_SGE */ { true,  true,  false },
    /* MAC_ARL */ { true,  false, false },
};


/* Write-mask text indexed by the 4-bit mask value (bit 3 = x ... bit 0 = w).
 * Each entry carries a leading comma because it is pasted directly after
 * the destination in the generated macro call, e.g. "MOV(R1,xyz, ...)". */
static const char *mask_str[] = {
    [0x0] = ",",      /* ____ */
    [0x1] = ",w",     /* ___w */
    [0x2] = ",z",     /* __z_ */
    [0x3] = ",zw",    /* __zw */
    [0x4] = ",y",     /* _y__ */
    [0x5] = ",yw",    /* _y_w */
    [0x6] = ",yz",    /* _yz_ */
    [0x7] = ",yzw",   /* _yzw */
    [0x8] = ",x",     /* x___ */
    [0x9] = ",xw",    /* x__w */
    [0xA] = ",xz",    /* x_z_ */
    [0xB] = ",xzw",   /* x_zw */
    [0xC] = ",xy",    /* xy__ */
    [0xD] = ",xyw",   /* xy_w */
    [0xE] = ",xyz",   /* xyz_ */
    [0xF] = ",xyzw"   /* xyzw */
};

/* Write-mask text used instead of mask_str[] when the destination is oFog.
 * A fog write places the most significant masked component in `x`; the
 * other components are assigned arbitrarily so the 4-component helper
 * functions still apply. Indexed by the same 4-bit mask value. */
static const char *fog_mask_str[] = {
    [0x0] = ",",
    [0x1] = ",x",
    [0x2] = ",x",
    [0x3] = ",xy",
    [0x4] = ",x",
    [0x5] = ",xy",
    [0x6] = ",xy",
    [0x7] = ",xyz",
    [0x8] = ",x",
    [0x9] = ",xy",
    [0xA] = ",xy",
    [0xB] = ",xyz",
    [0xC] = ",xy",
    [0xD] = ",xyz",
    [0xE] = ",xyz",
    [0xF] = ",xyzw"
};

/* Mnemonic for each MAC opcode, indexed by VshMAC. The spelling must stay
 * upper-case: the names are emitted verbatim and have to match the
 * case-sensitive helper macros of the generated GLSL. */
static const char *mac_opcode[] = {
    /* MAC_NOP */ "NOP",
    /* MAC_MOV */ "MOV",
    /* MAC_MUL */ "MUL",
    /* MAC_ADD */ "ADD",
    /* MAC_MAD */ "MAD",
    /* MAC_DP3 */ "DP3",
    /* MAC_DPH */ "DPH",
    /* MAC_DP4 */ "DP4",
    /* MAC_DST */ "DST",
    /* MAC_MIN */ "MIN",
    /* MAC_MAX */ "MAX",
    /* MAC_SLT */ "SLT",
    /* MAC_SGE */ "SGE",
    /* MAC_ARL */ "ARL A0.x", // Dxbx note : Alias for "mov a0.x"
};

/* Mnemonic for each ILU opcode, indexed by VshILU. */
static const char *ilu_opcode[] = {
    /* ILU_NOP */ "NOP",
    /* ILU_MOV */ "MOV",
    /* ILU_RCP */ "RCP",
    /* ILU_RCC */ "RCC",
    /* ILU_RSQ */ "RSQ",
    /* ILU_EXP */ "EXP",
    /* ILU_LOG */ "LOG",
    /* ILU_LIT */ "LIT",
};

/* Per ILU opcode (indexed by VshILU): true when the opcode takes a scalar
 * operand, in which case decode_swizzle() replicates the first swizzle
 * selector of input C across all four components. */
static bool ilu_force_scalar[] = {
    /* ILU_NOP */ false,
    /* ILU_MOV */ false,
    /* ILU_RCP */ true,
    /* ILU_RCC */ true,
    /* ILU_RSQ */ true,
    /* ILU_EXP */ true,
    /* ILU_LOG */ true,
    /* ILU_LIT */ false,
};

/* Index of oFog in out_reg_name[]; fog writes use fog_mask_str[]. */
#define OUTPUT_REG_FOG 5

/* Output register names, indexed by the low 4 bits of FLD_OUT_ADDRESS.
 * "???" marks indices for which no name is known. */
static const char *out_reg_name[] = {
    [0]              = "oPos",
    [1]              = "???",
    [2]              = "???",
    [3]              = "oD0",
    [4]              = "oD1",
    [OUTPUT_REG_FOG] = "oFog",
    [6]              = "oPts",
    [7]              = "oB0",
    [8]              = "oB1",
    [9]              = "oT0",
    [10]             = "oT1",
    [11]             = "oT2",
    [12]             = "oT3",
    [13]             = "???",
    [14]             = "???",
    [15]             = "A0.x",
};


/*
 * Extracts a bit field from one 32-bit word of an instruction token.
 *
 * shader_token: the instruction words.
 * subtoken:     index of the word to read; the caller must keep it in range.
 * start_bit:    position of the field's least significant bit.
 * bit_length:   width of the field in bits.
 *
 * Returns the field value, right-aligned. Shifting a 32-bit value by 32 or
 * more is undefined in C, so out-of-range positions and widths are handled
 * explicitly: a start_bit of 32 or more yields 0, and a bit_length of 32 or
 * more selects every remaining bit.
 */
static int vsh_get_from_token(const uint32_t *shader_token,
                              uint8_t subtoken,
                              uint8_t start_bit,
                              uint8_t bit_length)
{
    if (start_bit >= 32) {
        return 0;
    }

    const uint32_t mask = (bit_length >= 32)
        ? 0xFFFFFFFFu
        : (((uint32_t)1 << bit_length) - 1u);

    return (int)((shader_token[subtoken] >> start_bit) & mask);
}

/*
 * Reads the named instruction field from a shader token.
 *
 * The field's location is looked up in field_mapping[] using field_name as
 * the index. The extracted value is truncated to 8 bits, which loses nothing
 * for the fields listed in the table (none is wider than 8 bits).
 */
uint8_t vsh_get_field(const uint32_t *shader_token, VshFieldName field_name)
{
    const VshFieldMapping *where = &field_mapping[field_name];
    const int raw = vsh_get_from_token(shader_token,
                                       where->subtoken,
                                       where->start_bit,
                                       where->bit_length);

    return (uint8_t)raw;
}


/*
 * Converts an encoded constant register address to the index used in the
 * generated code.
 *
 * The address is split into a 32-entry bank (bits 5..7) and an offset
 * (bits 0..4); the bank is rebased by -3 to give a signed index, which is
 * then shifted by VSH_D3DSCM_CORRECTION (mapping -96..95 to 0..191).
 * FIXME (carried over): for inputs 0..255 the net result equals c_reg.
 */
static int16_t convert_c_register(const int16_t c_reg)
{
    const int16_t bank = (c_reg >> 5) & 7;
    const int16_t offset = c_reg & 31;
    const int16_t signed_index = ((bank - 3) * 32) + offset;

    return signed_index + VSH_D3DSCM_CORRECTION;
}


/*
 * Builds the swizzle suffix (e.g. ".xyz") for one instruction input.
 *
 * swizzle_field names the input's X selector; the Y, Z and W selectors are
 * read from the three field names that follow it.
 *
 * The identity swizzle yields an empty string. Otherwise trailing selectors
 * that merely repeat the previous one are dropped, so ".xyyy" is written as
 * ".xy" and ".zzzz" as ".z".
 *
 * Returns a new MString owned by the caller.
 */
static MString* decode_swizzle(const uint32_t *shader_token,
                               VshFieldName swizzle_field)
{
    static const char component_name[] = "xyzw";
    VshSwizzle sel[4];

    /* some microcode instructions force a scalar value: input C of a
     * scalar ILU opcode uses its X selector for every component */
    const bool scalar_only =
        swizzle_field == FLD_C_SWZ_X
        && ilu_force_scalar[vsh_get_field(shader_token, FLD_ILU)];

    for (int i = 0; i < 4; i++) {
        const VshFieldName field =
            scalar_only ? swizzle_field : (VshFieldName)(swizzle_field + i);
        sel[i] = vsh_get_field(shader_token, field);
    }

    /* Don't print the swizzle if it's .xyzw */
    if (sel[0] == SWIZZLE_X && sel[1] == SWIZZLE_Y
        && sel[2] == SWIZZLE_Z && sel[3] == SWIZZLE_W) {
        return mstring_from_str("");
    }

    /* Don't print duplicates: strip repeated selectors from the tail */
    int count = 4;
    while (count > 1 && sel[count - 1] == sel[count - 2]) {
        count--;
    }

    char text[6];
    text[0] = '.';
    for (int i = 0; i < count; i++) {
        text[1 + i] = component_name[sel[i]];
    }
    text[1 + count] = '\0';

    return mstring_from_str(text);
}

/*
 * Decodes one instruction input (A, B or C) into its textual form: an
 * optional "-" sign, the register reference, then the swizzle suffix.
 *
 * shader_token: the instruction words to decode.
 * param:        source kind selected by the input's MUX field.
 * neg_field:    the input's NEG field; the swizzle fields are read starting
 *               from the field name that follows it.
 * reg_num:      R-register index; only used when param is PARAM_R, the
 *               other kinds read their index from the token.
 *
 * Returns a new MString that the caller releases with mstring_unref().
 */
static MString* decode_opcode_input(const uint32_t *shader_token,
                                    VshParameterType param,
                                    VshFieldName neg_field,
                                    int reg_num)
{
    MString *ret_str = mstring_new();

    if (vsh_get_field(shader_token, neg_field) > 0) {
        mstring_append(ret_str, "-");
    }

    /* PARAM_R uses the supplied reg_num, but the other two need to be
     * determined.
     * tmp starts out empty so that nothing indeterminate is appended should
     * the default branch below ever fall through (asserts compiled out). */
    char tmp[40] = "";
    switch (param) {
    case PARAM_R:
        snprintf(tmp, sizeof(tmp), "R%d", reg_num);
        break;
    case PARAM_V:
        reg_num = vsh_get_field(shader_token, FLD_V);
        snprintf(tmp, sizeof(tmp), "v%d", reg_num);
        break;
    case PARAM_C:
        reg_num = convert_c_register(vsh_get_field(shader_token, FLD_CONST));
        if (vsh_get_field(shader_token, FLD_A0X) > 0) {
            //FIXME: does this really require the "correction" done in convert_c_register?!
            snprintf(tmp, sizeof(tmp), "c[A0+%d]", reg_num);
        } else {
            snprintf(tmp, sizeof(tmp), "c[%d]", reg_num);
        }
        break;
    default:
        fprintf(stderr, "Unknown vs param: 0x%x\n", (unsigned int)param);
        assert(false);
        break;
    }
    mstring_append(ret_str, tmp);

    {
        /* swizzle bits are next to the neg bit */
        MString *swizzle_str = decode_swizzle(shader_token, neg_field+1);
        mstring_append(ret_str, mstring_get_str(swizzle_str));
        mstring_unref(swizzle_str);
    }

    return ret_str;
}


/*
 * Generates the GLSL statements for one half (MAC or ILU) of an
 * instruction.
 *
 * shader_token: the instruction words.
 * out_mux:      which unit this call is generating code for.
 * mask:         write mask for the temporary (R) register destination.
 * opcode:       mnemonic, i.e. the name of the GLSL helper macro to call.
 * inputs:       already formatted ", <input>" argument list.
 * suffix:       out parameter, required for the MAC half. When the MAC is
 *               paired with an ILU opcode its result goes to a scratch
 *               variable first, and *suffix receives the statements that
 *               copy the scratch value to the real destination afterwards.
 *               It is only written in that paired case.
 *
 * Returns a new MString with the generated statements (possibly empty).
 */
static MString* decode_opcode(const uint32_t *shader_token,
                              VshOutputMux out_mux,
                              uint32_t mask,
                              const char *opcode,
                              const char *inputs,
                              MString** suffix)
{
    MString *code = mstring_new();
    int dest_reg = vsh_get_field(shader_token, FLD_OUT_R);
    const bool is_arl = strcmp(opcode, mac_opcode[MAC_ARL]) == 0;
    bool via_temp = false;

    /* Test for paired opcodes (in other words : Are both <> NOP?) */
    if (out_mux == OMUX_MAC) {
        if (vsh_get_field(shader_token, FLD_ILU) != ILU_NOP) {
            via_temp = true;
            if (dest_reg == 1) {
                /* Ignore paired MAC opcodes that write to R1 */
                mask = 0;
            }
        }
    } else if (out_mux == OMUX_ILU) {
        if (vsh_get_field(shader_token, FLD_MAC) != MAC_NOP) {
            /* Paired ILU opcodes can only write to R1 */
            dest_reg = 1;
        }
    }

    /* Write to the output register, provided this unit is the one selected
     * by the output mux and the write is not masked away. */
    const int o_mask = vsh_get_field(shader_token, FLD_OUT_O_MASK);
    if (o_mask != 0
        && vsh_get_field(shader_token, FLD_OUT_MUX) == out_mux) {
        const char *const *mask_table = mask_str;

        mstring_append(code, "  ");
        mstring_append(code, opcode);
        mstring_append(code, "(");

        if (vsh_get_field(shader_token, FLD_OUT_ORB) == OUTPUT_C) {
            assert(!"TODO: Emulate writeable const registers");
            mstring_append_fmt(code, "c%d",
                               convert_c_register(vsh_get_field(
                                   shader_token, FLD_OUT_ADDRESS)));
        } else {
            const int out_reg =
                vsh_get_field(shader_token, FLD_OUT_ADDRESS) & 0xF;
            mstring_append(code, out_reg_name[out_reg]);
            if (out_reg == OUTPUT_REG_FOG) {
                /* oFog uses its own component mapping */
                mask_table = fog_mask_str;
            }
        }

        mstring_append(code, mask_table[o_mask]);
        mstring_append(code, inputs);
        mstring_append(code, ");\n");
    }

    if (via_temp) {
        /* Paired MAC: compute into a scratch variable now and let the
         * caller commit it through *suffix once the ILU half is emitted. */
        assert(suffix && "Temp var flagged on non-MAC instruction");
        *suffix = mstring_new();

        if (is_arl) {
            mstring_append_fmt(code, "  ARL(_temp_addr%s);\n", inputs);
            mstring_append(*suffix, "  A0 = _temp_addr;\n");
        } else if (mask > 0) {
            const char *dest_mask = mask_str[mask];
            /* Skip the leading comma */
            const char *components = dest_mask + 1;

            mstring_append_fmt(code, "  %s(_temp_vec%s%s);\n",
                               opcode, dest_mask, inputs);

            if (components[0] != '\0') {
                mstring_append_fmt(*suffix,
                                   "  R%d.%s = _temp_vec.%s;\n",
                                   dest_reg, components, components);
            } else {
                mstring_append_fmt(*suffix, "  R%d = _temp_vec;\n", dest_reg);
            }
        }
        return code;
    }

    /* Unpaired: write straight to the destination. */
    if (is_arl) {
        mstring_append_fmt(code, "  ARL(A0%s);\n", inputs);
    } else if (mask > 0) {
        mstring_append_fmt(code, "  %s(R%d%s%s);\n",
                           opcode, dest_reg, mask_str[mask], inputs);
    }

    return code;
}


static MString* decode_token(const uint32_t *shader_token)
{
    MString *ret;

    /* See what MAC opcode is written to (if not masked away): */
    VshMAC mac = vsh_get_field(shader_token, FLD_MAC);
    /* See if a ILU opcode is present too: */
    VshILU ilu = vsh_get_field(shader_token, FLD_ILU);
    if (mac == MAC_NOP && ilu == ILU_NOP) {
        return mstring_new();
    }

    /* Since it's potentially used twice, decode input C once: */
    MString *input_c =
        decode_opcode_input(shader_token,
                            vsh_get_field(shader_token, FLD_C_MUX),
                            FLD_C_NEG,
                            (vsh_get_field(shader_token, FLD_C_R_HIGH) << 2)
                                | vsh_get_field(shader_token, FLD_C_R_LOW));

    MString *mac_suffix = NULL;
    if (mac != MAC_NOP) {
        MString *inputs_mac = mstring_new();
        if (mac_opcode_params[mac].A) {
            MString *input_a =
                decode_opcode_input(shader_token,
                                    vsh_get_field(shader_token, FLD_A_MUX),
                                    FLD_A_NEG,
                                    vsh_get_field(shader_token, FLD_A_R));
            mstring_append(inputs_mac, ", ");
            mstring_append(inputs_mac, mstring_get_str(input_a));
            mstring_unref(input_a);
        }
        if (mac_opcode_params[mac].B) {
            MString *input_b =
                decode_opcode_input(shader_token,
                                    vsh_get_field(shader_token, FLD_B_MUX),
                                    FLD_B_NEG,
                                    vsh_get_field(shader_token, FLD_B_R));
            mstring_append(inputs_mac, ", ");
            mstring_append(inputs_mac, mstring_get_str(input_b));
            mstring_unref(input_b);
        }
        if (mac_opcode_params[mac].C) {
            mstring_append(inputs_mac, ", ");
            mstring_append(inputs_mac, mstring_get_str(input_c));
        }

        /* Then prepend these inputs with the actual opcode, mask, and input : */
        ret = decode_opcode(shader_token,
                            OMUX_MAC,
                            vsh_get_field(shader_token, FLD_OUT_MAC_MASK),
                            mac_opcode[mac],
                            mstring_get_str(inputs_mac),
                            &mac_suffix);
        mstring_unref(inputs_mac);
    } else {
        ret = mstring_new();
    }

    if (ilu != ILU_NOP) {
        MString *inputs_c = mstring_from_str(", ");
        mstring_append(inputs_c, mstring_get_str(input_c));

        /* Append the ILU opcode, mask and (the already determined) input C: */
        MString *ilu_op =
            decode_opcode(shader_token,
                          OMUX_ILU,
                          vsh_get_field(shader_token, FLD_OUT_ILU_MASK),
                          ilu_opcode[ilu],
                          mstring_get_str(inputs_c),
                          NULL);

        mstring_append(ret, mstring_get_str(ilu_op));

        mstring_unref(inputs_c);
        mstring_unref(ilu_op);
    }

    mstring_unref(input_c);

    if (mac_suffix) {
        mstring_append(ret, mstring_get_str(mac_suffix));
        mstring_unref(mac_suffix);
    }

    return ret;
}

/*
 * GLSL prelude appended to the shader header by pgraph_gen_vsh_prog_glsl().
 *
 * It declares the address register A0, the temporary registers R0..R11
 * (R12 aliases oPos), the scratch variables used for paired MAC+ILU
 * instructions, and one helper macro/function pair per opcode. The macros
 * take the destination, the write mask and the inputs as separate
 * arguments, which is the form decode_opcode() emits.
 *
 * The text references clampAwayZeroInf(), which is not defined in this
 * string -- presumably supplied by the code that assembles the final
 * shader; TODO confirm.
 */
static const char* vsh_header =
    "\n"
    "int A0 = 0;\n"
    "\n"
    "vec4 R0 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R1 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R2 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R3 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R4 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R5 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R6 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R7 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R8 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R9 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R10 = vec4(0.0,0.0,0.0,0.0);\n"
    "vec4 R11 = vec4(0.0,0.0,0.0,0.0);\n"
    "#define R12 oPos\n" /* R12 is a mirror of oPos */
    "\n"

    /* Used to emulate concurrency of paired MAC+ILU instructions */
    "vec4 _temp_vec;\n"
    "int _temp_addr;\n"

    /* See:
     * http://msdn.microsoft.com/en-us/library/windows/desktop/bb174703%28v=vs.85%29.aspx
     * https://www.opengl.org/registry/specs/NV/vertex_program1_1.txt
     */
    "\n"
//QQQ #ifdef NICE_CODE
    "/* Converts the input to vec4, pads with last component */\n"
    "vec4 _in(float v) { return vec4(v); }\n"
    "vec4 _in(vec2 v) { return v.xyyy; }\n"
    "vec4 _in(vec3 v) { return v.xyzz; }\n"
    "vec4 _in(vec4 v) { return v.xyzw; }\n"
//#else
//    "/* Make sure input is always a vec4 */\n"
//   "#define _in(v) vec4(v)\n"
//#endif
    "\n"
    "#define INFINITY (1.0 / 0.0)\n"
    "\n"
    "#define MOV(dest, mask, src) dest.mask = _MOV(_in(src)).mask\n"
    "vec4 _MOV(vec4 src)\n"
    "{\n"
    "  return src;\n"
    "}\n"
    "\n"
    "#define MUL(dest, mask, src0, src1) dest.mask = _MUL(_in(src0), _in(src1)).mask\n"
    "vec4 _MUL(vec4 src0, vec4 src1)\n"
    "{\n"
    // Unfortunately mix() falls victim to the same handling of exceptional
    // (inf/NaN) handling as a multiply, so per-component comparisons are used
    // to guarantee HW behavior (anything * 0 must == 0).
    "  vec4 zero_components = sign(src0) * sign(src1);\n"
    "  vec4 ret = src0 * src1;\n"
    "  if (zero_components.x == 0.0) { ret.x = 0.0; }\n"
    "  if (zero_components.y == 0.0) { ret.y = 0.0; }\n"
    "  if (zero_components.z == 0.0) { ret.z = 0.0; }\n"
    "  if (zero_components.w == 0.0) { ret.w = 0.0; }\n"
    "  return ret;\n"
    "}\n"
    "\n"
    "#define ADD(dest, mask, src0, src1) dest.mask = _ADD(_in(src0), _in(src1)).mask\n"
    "vec4 _ADD(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return src0 + src1;\n"
    "}\n"
    "\n"
    "#define MAD(dest, mask, src0, src1, src2) dest.mask = _MAD(_in(src0), _in(src1), _in(src2)).mask\n"
    "vec4 _MAD(vec4 src0, vec4 src1, vec4 src2)\n"
    "{\n"
    "  return _MUL(src0, src1) + src2;\n"
    "}\n"
    "\n"
    "#define DP3(dest, mask, src0, src1) dest.mask = _DP3(_in(src0), _in(src1)).mask\n"
    "vec4 _DP3(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(dot(src0.xyz, src1.xyz));\n"
    "}\n"
    "\n"
    "#define DPH(dest, mask, src0, src1) dest.mask = _DPH(_in(src0), _in(src1)).mask\n"
    "vec4 _DPH(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(dot(vec4(src0.xyz, 1.0), src1));\n"
    "}\n"
    "\n"
    "#define DP4(dest, mask, src0, src1) dest.mask = _DP4(_in(src0), _in(src1)).mask\n"
    "vec4 _DP4(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(dot(src0, src1));\n"
    "}\n"
    "\n"
    "#define DST(dest, mask, src0, src1) dest.mask = _DST(_in(src0), _in(src1)).mask\n"
    "vec4 _DST(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(1.0,\n"
    "              src0.y * src1.y,\n"
    "              src0.z,\n"
    "              src1.w);\n"
    "}\n"
    "\n"
    "#define MIN(dest, mask, src0, src1) dest.mask = _MIN(_in(src0), _in(src1)).mask\n"
    "vec4 _MIN(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return min(src0, src1);\n"
    "}\n"
    "\n"
    "#define MAX(dest, mask, src0, src1) dest.mask = _MAX(_in(src0), _in(src1)).mask\n"
    "vec4 _MAX(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return max(src0, src1);\n"
    "}\n"
    "\n"
    "#define SLT(dest, mask, src0, src1) dest.mask = _SLT(_in(src0), _in(src1)).mask\n"
    "vec4 _SLT(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(lessThan(src0, src1));\n"
    "}\n"
    "\n"
    "#define ARL(dest, src) dest = _ARL(_in(src).x)\n"
    "int _ARL(float src)\n"
    "{\n"
    "  /* Xbox GPU does specify rounding, OpenGL doesn't; so we need a bias.\n"
    "   * Example: We probably want to floor 16.99.. to 17, not 16.\n"
    "   * Source of error (why we get 16.99.. instead of 17.0) is typically\n"
    "   * vertex-attributes being normalized from a byte value to float:\n"
    "   *   17 / 255 = 0.06666.. so is this 0.06667 (ceil) or 0.06666 (floor)?\n"
    "   * Which value we get depends on the host GPU.\n"
    "   * If we multiply these rounded values by 255 later, we get:\n"
    "   *   17.00 (ARL result = 17) or 16.99 (ARL result = 16).\n"
    "   * We assume the intend was to get 17, so we add our bias to fix it. */\n"
    "  return int(floor(src + 0.001));\n"
    "}\n"
    "\n"
    "#define SGE(dest, mask, src0, src1) dest.mask = _SGE(_in(src0), _in(src1)).mask\n"
    "vec4 _SGE(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(greaterThanEqual(src0, src1));\n"
    "}\n"
    "\n"
    "#define RCP(dest, mask, src) dest.mask = _RCP(_in(src).x).mask\n"
    "vec4 _RCP(float src)\n"
    "{\n"
    "  return vec4(1.0 / src);\n"
    "}\n"
    "\n"
    "#define RCC(dest, mask, src) dest.mask = _RCC(_in(src).x).mask\n"
    "vec4 _RCC(float src)\n"
    "{\n"
    "  float t = clampAwayZeroInf(1.0 / src);\n"
    "  return vec4(t);\n"
    "}\n"
    "\n"
    "#define RSQ(dest, mask, src) dest.mask = _RSQ(_in(src).x).mask\n"
    "vec4 _RSQ(float src)\n"
    "{\n"
    "  if (src == 0.0) { return vec4(INFINITY); }\n"
    "  if (isinf(src)) { return vec4(0.0); }\n"
    "  return vec4(inversesqrt(abs(src)));\n"
    "}\n"
    "\n"
    "#define EXP(dest, mask, src) dest.mask = _EXP(_in(src).x).mask\n"
    "vec4 _EXP(float src)\n"
    "{\n"
    "  vec4 result;\n"
    "  result.x = exp2(floor(src));\n"
    "  result.y = src - floor(src);\n"
    "  result.z = exp2(src);\n"
    "  result.w = 1.0;\n"
    "  return result;\n"
    "}\n"
    "\n"
    "#define LOG(dest, mask, src) dest.mask = _LOG(_in(src).x).mask\n"
    "vec4 _LOG(float src)\n"
    "{\n"
    "  float tmp = abs(src);\n"
    "  if (tmp == 0.0) { return vec4(-INFINITY, 1.0f, -INFINITY, 1.0f); }\n"
    "  vec4 result;\n"
    "  result.x = floor(log2(tmp));\n"
    "  result.y = tmp / exp2(floor(log2(tmp)));\n"
    "  result.z = log2(tmp);\n"
    "  result.w = 1.0;\n"
    "  return result;\n"
    "}\n"
    "\n"
    "#define LIT(dest, mask, src) dest.mask = _LIT(_in(src)).mask\n"
    "vec4 _LIT(vec4 src)\n"
    "{\n"
    "  vec4 s = src;\n"
    "  float epsilon = 1.0 / 256.0;\n"
    "  s.w = clamp(s.w, -(128.0 - epsilon), 128.0 - epsilon);\n"
    "  s.x = max(s.x, 0.0);\n"
    "  s.y = max(s.y, 0.0);\n"
    "  vec4 t = vec4(1.0, 0.0, 0.0, 1.0);\n"
    "  t.y = s.x;\n"
#if 1
    "  t.z = (s.x > 0.0) ? exp2(s.w * log2(s.y)) : 0.0;\n"
#else
    "  t.z = (s.x > 0.0) ? pow(s.y, s.w) : 0.0;\n"
#endif
    "  return t;\n"
    "}\n";

/*
 * Translates a vertex program into GLSL.
 *
 * version: currently unused by this function.
 * tokens:  program words, VSH_TOKEN_SIZE words per instruction slot.
 * length:  number of instruction slots available in tokens.
 * vulkan:  selects the Y-axis convention of the final position transform.
 * header:  receives the GLSL prelude (register declarations and opcode
 *          helpers).
 * body:    receives one commented block of statements per slot, followed
 *          by the conversion of oPos from screen space to clip space.
 *
 * Translation stops after the first instruction with the FINAL bit set;
 * the program is required (asserted) to contain one within length slots.
 */
void pgraph_gen_vsh_prog_glsl(uint16_t version,
                   const uint32_t *tokens,
                   unsigned int length,
                   bool vulkan,
                   MString *header, MString *body)
{

    mstring_append(header, vsh_header);

    bool has_final = false;

    /* The counter is unsigned to match `length`; a signed counter would be
     * implicitly converted in the comparison below. */
    for (unsigned int slot = 0; slot < length; slot++) {
        const uint32_t* cur_token = &tokens[slot * VSH_TOKEN_SIZE];
        MString *token_str = decode_token(cur_token);
        mstring_append_fmt(body,
                           "  /* Slot %u: 0x%08X 0x%08X 0x%08X 0x%08X */",
                           slot,
                           cur_token[0],cur_token[1],cur_token[2],cur_token[3]);
        mstring_append(body, "\n");
        mstring_append(body, mstring_get_str(token_str));
        mstring_append(body, "\n");
        mstring_unref(token_str);

        if (vsh_get_field(cur_token, FLD_FINAL)) {
            has_final = true;
            break;
        }
    }
    assert(has_final);

    mstring_append(body,
        /* the shaders leave the result in screen space, while
         * opengl expects it in clip space.
         * TODO: the pixel-center co-ordinate differences should be handled
         */
        "  oPos.x = 2.0 * (oPos.x - surfaceSize.x * 0.5) / surfaceSize.x;\n"
        );

    /* The two variants differ only in the direction of the Y axis. */
    if (vulkan) {
        mstring_append(body,
                       "  oPos.y = 2.0 * oPos.y / surfaceSize.y - 1.0;\n");
    } else {
        mstring_append(body, "  oPos.y = -2.0 * (oPos.y - surfaceSize.y * 0.5) "
                             "/ surfaceSize.y;\n");
    }

    mstring_append(body,
        "  oPos.z = oPos.z / clipRange.y;\n"
        "  oPos.w = clampAwayZeroInf(oPos.w);\n"

        /* Undo perspective divide by w.
         * Note that games may also have vertex shaders that do
         * not divide by w (such as 2D-graphics menus or overlays), but since
         * OpenGL will later on divide by the same w, we get back the same
         * screen space coordinates (perhaps with some loss of floating point
         * precision, though.)
         */
        "  oPos.xyz *= oPos.w;\n"
    );
}