project64/Source/Project64-video/TextureEnhancer/tc-1.1+/fxt1.c

// Project64 - A Nintendo 64 emulator
// https://www.pj64-emu.com/
// Copyright(C) 2001-2021 Project64
// Copyright(C) 2007 Hiroshi Morii
// Copyright(C) 2004 Daniel Borca
// GNU/GPLv2 licensed: https://gnu.org/licenses/gpl-2.0.html

#include <stdlib.h>
#include <string.h>

#include "types.h"
#include "internal.h"
#include "fxt1.h"

/*
FXT1 encoder
The encoder was built by reversing the decoder,
and is vaguely based on Texus2 by 3DFX. Note that this code
is merely a proof of concept, since it is highly unoptimized;
Moreover, it is sub-optimal due to initial conditions passed
to Lloyd's algorithm (the interpolation modes are even worse).
*/

// Encoder tuning constants and small helpers
#define MAX_COMP 4 /* largest number of components a texel ever has */
#define MAX_VECT 4 /* largest number of base vectors ever searched for */
#define N_TEXELS 32 /* number of texels in a block (always 32) */
#define LL_N_REP 50 /* upper bound on iterations of Lloyd's VQ loop */
#define LL_RMS_D 10 /* Lloyd stops once the error improves by less than this */
#define LL_RMS_E 255 /* Lloyd stops once the accumulated error is below this */
#define ALPHA_TS 2 /* alpha threshold: alpha >= (255 - ALPHA_TS) is deemed opaque */
// True when the dword read at v is zero (used on 4-byte texels: transparent black)
#define ISTBLACK(v) (*((dword *)(v)) == 0)
// Copy one dword from SRC to DST (used to copy a 4-byte texel in one store)
#define COPY_4UBV(DST, SRC) *((dword *)(DST)) = *((dword *)(SRC))

/*
Find the candidate vector nearest to one texel.

vec[][MAX_COMP]  Candidate vectors
nv               Number of candidates to examine
input[MAX_COMP]  Texel to match
nc               Number of components compared

Returns the index of the candidate with the smallest squared distance
(the first one wins on ties), or -1 if no candidate scored below the
initial bound (e.g. nv <= 0).
*/
static int
fxt1_bestcol(float vec[][MAX_COMP], int nv,
    byte input[MAX_COMP], int nc)
{
    int winner = -1;
    float lowest = 1e9; // Big enough
    int v;

    for (v = 0; v < nv; v++) {
        const float *cand = vec[v];
        float dist = 0.0F;
        int c;

        // Squared Euclidean distance over the first nc components
        for (c = 0; c < nc; c++) {
            dist += (cand[c] - input[c]) * (cand[c] - input[c]);
        }
        if (!(dist < lowest)) {
            continue;
        }
        lowest = dist;
        winner = v;
    }

    return winner;
}

/*
Find the texel that lies farthest from a given vector.

vec[MAX_COMP]              Reference vector
input[N_TEXELS][MAX_COMP]  Texels to scan
nc                         Number of components compared
n                          Number of texels to scan

Returns the index of the texel with the largest squared distance
(the first one wins on ties), or -1 when n <= 0.
*/
static int
fxt1_worst(float vec[MAX_COMP],
    byte input[N_TEXELS][MAX_COMP], int nc, int n)
{
    int farthest = -1;
    float largest = -1.0F; // Small enough
    int t;

    for (t = 0; t < n; t++) {
        const byte *texel = input[t];
        float dist = 0.0F;
        int c;

        for (c = 0; c < nc; c++) {
            dist += (vec[c] - texel[c]) * (vec[c] - texel[c]);
        }
        if (dist > largest) {
            farthest = t;
            largest = dist;
        }
    }

    return farthest;
}

/*
Compute the per-channel variance of a run of texels.

variance[MAX_COMP]         Optional output (may be NULL): variance per channel
input[N_TEXELS][MAX_COMP]  Texels to analyze
nc                         Number of channels to analyze
n                          Number of texels to analyze

Returns the index of the channel with the largest variance
(the lowest index wins on ties).
*/
static int
fxt1_variance(double variance[MAX_COMP],
    byte input[N_TEXELS][MAX_COMP], int nc, int n)
{
    const double inv_n = 1.0 / n;
    double top = -1; // Small enough
    int winner = 0;
    int c;

    for (c = 0; c < nc; c++) {
        dword total = 0;    // Sum of samples
        dword total_sq = 0; // Sum of squared samples
        double v;
        int t;

        for (t = 0; t < n; t++) {
            int sample = input[t][c];
            total += sample;
            total_sq += sample * sample;
        }

        // var = E[x^2] - E[x]^2
        v = total_sq * inv_n - total * total * inv_n * inv_n;
        if (variance) {
            variance[c] = v;
        }
        if (top < v) {
            top = v;
            winner = c;
        }
    }

    return winner;
}

/*
Choose the initial base vectors for a block.

vec[][MAX_COMP]            Receives the nv chosen vectors
nv                         Number of vectors to produce
input[N_TEXELS][MAX_COMP]  Input texels
nc                         Number of components in input / vec
n                          Number of input samples

If the input holds no more than nv distinct colors, those colors are
used directly (unused slots repeat vec[0]) and 0 is returned, so the
caller can skip refinement. Otherwise the vectors are spread evenly
between the darkest and the brightest texel and non-zero is returned.
*/
static int
fxt1_choose(float vec[][MAX_COMP], int nv,
    byte input[N_TEXELS][MAX_COMP], int nc, int n)
{
#if 0
    // Disabled alternative: take nv texels evenly spaced through the input
    int comp, slot;

    for (slot = 0; slot < nv; slot++) {
        int pick = slot * (n - 1) / (nv - 1);
        for (comp = 0; comp < nc; comp++) {
            vec[slot][comp] = input[pick][comp];
        }
    }
#else
    // Table of the distinct colors met so far
    struct {
        int used;     // Slot is occupied
        dword packed; // Components packed 8 bits each
        int count;    // Occurrences of this color
        int first;    // Index of the first texel with this color
    } seen[N_TEXELS];
    int n_seen = 0;
#ifndef YUV
    int lowest = 2000; // Big enough
#else
    int lowest = 2000000;
#endif
    int highest = -1; // Small enough
    int darkest = 0;
    int brightest = 0;
    int span;
    int comp, slot, t;

    memset(seen, 0, sizeof(seen));

    for (t = 0; t < n; t++) {
        dword packed = 0;
        int weight = 0;
        int h;

        for (comp = 0; comp < nc; comp++) {
            packed = (packed << 8) | input[t][comp];
#ifndef YUV
            weight += input[t][comp];
#else
            /* RGB to YUV conversion according to CCIR 601 specs
             * Y = 0.299R+0.587G+0.114B
             * U = 0.713(R - Y) = 0.500R-0.419G-0.081B
             * V = 0.564(B - Y) = -0.169R-0.331G+0.500B
             */
            weight = 299 * input[t][RCOMP] + 587 * input[t][GCOMP] + 114 * input[t][BCOMP];
#endif
        }

        // Record the color: bump its counter or claim the first free slot
        for (h = 0; h < n; h++) {
            if (seen[h].used) {
                if (seen[h].packed == packed) {
                    seen[h].count++;
                    break;
                }
                continue;
            }
            seen[h].used = !0;
            seen[h].packed = packed;
            seen[h].count = 1;
            seen[h].first = t;
            n_seen = h + 1;
            break;
        }

        // Track the darkest and the brightest texel
        if (lowest > weight) {
            lowest = weight;
            darkest = t;
        }
        if (highest < weight) {
            highest = weight;
            brightest = t;
        }
    }

    if (n_seen <= nv) {
        // Few enough distinct colors: use them verbatim
        for (slot = 0; slot < n_seen; slot++) {
            const byte *texel = input[seen[slot].first];
            for (comp = 0; comp < nc; comp++) {
                vec[slot][comp] = (float)texel[comp];
            }
        }
        // Pad the remaining slots with the first vector
        while (slot < nv) {
            for (comp = 0; comp < nc; comp++) {
                vec[slot][comp] = vec[0][comp];
            }
            slot++;
        }
        return 0;
    }

    // Spread the vectors linearly from the darkest to the brightest texel
    span = nv - 1;
    for (slot = 0; slot < nv; slot++) {
        for (comp = 0; comp < nc; comp++) {
            int blend = (span - slot) * input[darkest][comp] + slot * input[brightest][comp] + span / 2;
            vec[slot][comp] = blend / (float)span;
        }
    }
#endif

    return !0;
}

/*
Refine a set of base vectors with the generalized Lloyd algorithm (VQ).

Each pass assigns every sample to its nearest vector, then moves each
vector to the centroid of the samples assigned to it. A vector that
attracted no samples is replaced by the sample lying farthest from it.
The loop ends when the accumulated error is small enough, when it stops
improving noticeably, or after LL_N_REP passes.

vec[][MAX_COMP]            Initial vectors on entry, refined vectors on exit
nv                         Number of vectors
input[N_TEXELS][MAX_COMP]  Input texels
nc                         Number of components in input / vec
n                          Number of input samples

Returns non-zero on a good match, 0 if it did not converge in time.
*/
static int
fxt1_lloyd(float vec[][MAX_COMP], int nv,
    byte input[N_TEXELS][MAX_COMP], int nc, int n)
{
    int acc[MAX_VECT][MAX_COMP]; // Per-vector sums of the assigned texels
    int hits[MAX_VECT];          // Per-vector count of assigned texels
    float prev_total = 1e9;
    int pass;

    for (pass = 0; pass < LL_N_REP; pass++) {
        float total = 0;
        int c, v, t;

        // Start the pass with clean accumulators
        for (v = 0; v < nv; v++) {
            hits[v] = 0;
            for (c = 0; c < nc; c++) {
                acc[v][c] = 0;
            }
        }

        // Assign every texel to its nearest vector
        for (t = 0; t < n; t++) {
            int nearest = -1;
            float nearest_dist = 1e9; // Big enough

            for (v = 0; v < nv; v++) {
                // The first three components are always compared; the fourth only when nc == 4
                float d = (vec[v][0] - input[t][0]) * (vec[v][0] - input[t][0]) +
                    (vec[v][1] - input[t][1]) * (vec[v][1] - input[t][1]) +
                    (vec[v][2] - input[t][2]) * (vec[v][2] - input[t][2]);
                if (nc == 4) {
                    d += (vec[v][3] - input[t][3]) * (vec[v][3] - input[t][3]);
                }
                if (d < nearest_dist) {
                    nearest_dist = d;
                    nearest = v;
                }
            }

            for (c = 0; c < nc; c++) {
                acc[nearest][c] += input[t][c];
            }
            hits[nearest]++;
            total += nearest_dist;
        }

        // Stop when the error is small or has stopped improving noticeably
        if (total < LL_RMS_E) {
            return !0; // Good match
        }
        if ((total < prev_total) && ((prev_total - total) < LL_RMS_D)) {
            return !0; // Good match
        }
        prev_total = total;

        // Move every vector to the centroid of its texels
        for (v = 0; v < nv; v++) {
            if (!hits[v]) {
                // Nothing was assigned to this vector: restart it from the farthest texel
                int outlier = fxt1_worst(vec[v], input, nc, n);
                for (c = 0; c < nc; c++) {
                    vec[v][c] = input[outlier][c];
                }
                continue;
            }
            {
                float scale = 1.0F / hits[v];
                for (c = 0; c < nc; c++) {
                    vec[v][c] = scale * acc[v][c];
                }
            }
        }
    }

    return 0; // Could not converge fast enough
}

/*
Encode one block in CC_CHROMA mode: four RGB base colors, and a 2-bit
selector per texel naming the nearest base color.

cc                         Receives the 128-bit encoded block (4 dwords)
input[N_TEXELS][MAX_COMP]  The 32 texels of the block
*/
static void
fxt1_quantize_CHROMA(dword *cc,
    byte input[N_TEXELS][MAX_COMP])
{
    const int n_vect = 4; // 4 base vectors to find
    const int n_comp = 3; // 3 components: R, G, B
    float palette[MAX_VECT][MAX_COMP];
    qword hi;        // High quadword
    dword right = 0; // Low quadword, hi DWORD: selectors of texels 16..31
    dword left = 0;  // Low quadword, lo DWORD: selectors of texels 0..15
    int comp, v, t;

    // Refine the initial palette only when the chooser asks for it
    if (fxt1_choose(palette, n_vect, input, n_comp, N_TEXELS) != 0) {
        fxt1_lloyd(palette, n_vect, input, n_comp, N_TEXELS);
    }

    Q_MOV32(hi, 4); // cc-chroma = "010" + unused bit
    v = n_vect;
    while (v-- > 0) {
        for (comp = 0; comp < n_comp; comp++) {
            // Append this component as 5 bits
            Q_SHL(hi, 5);
            Q_OR32(hi, (dword)(palette[v][comp] / 8.0F));
        }
    }
    ((qword *)cc)[1] = hi;

    // Right microtile
    for (t = N_TEXELS - 1; t >= N_TEXELS / 2; t--) {
        right = (right << 2) | fxt1_bestcol(palette, n_vect, input[t], n_comp);
    }
    // Left microtile
    for (t = N_TEXELS / 2 - 1; t >= 0; t--) {
        left = (left << 2) | fxt1_bestcol(palette, n_vect, input[t], n_comp);
    }
    cc[1] = right;
    cc[0] = left;
}

/*
Encode one block in CC_ALPHA mode with lerp = 0: three RGBA base colors
plus an implicit all-zero fourth entry, and a 2-bit selector per texel.

cc                         Receives the 128-bit encoded block (4 dwords)
input[N_TEXELS][MAX_COMP]  The 32 texels of the block
reord[N_TEXELS][MAX_COMP]  Texels used to derive the base colors
n                          Number of valid entries in reord
*/
static void
fxt1_quantize_ALPHA0(dword *cc,
    byte input[N_TEXELS][MAX_COMP],
    byte reord[N_TEXELS][MAX_COMP], int n)
{
    const int n_vect = 3; // 3 base vectors to find
    const int n_comp = 4; // 4 components: R, G, B, A
    float palette[MAX_VECT][MAX_COMP];
    qword hi;        // High quadword
    dword right = 0; // Low quadword, hi DWORD: selectors of texels 16..31
    dword left = 0;  // Low quadword, lo DWORD: selectors of texels 0..15
    int comp, v, t;

    // The entry after the searched vectors stands for zero
    for (comp = 0; comp < n_comp; comp++) {
        palette[n_vect][comp] = 0;
    }

    // The first n texels in reord are guaranteed to be non-zero
    if (fxt1_choose(palette, n_vect, reord, n_comp, n) != 0) {
        fxt1_lloyd(palette, n_vect, reord, n_comp, n);
    }

    Q_MOV32(hi, 6); // Alpha = "011" + lerp = 0
    // Alphas first, highest vector first
    v = n_vect;
    while (v-- > 0) {
        Q_SHL(hi, 5);
        Q_OR32(hi, (dword)(palette[v][ACOMP] / 8.0F));
    }
    // Then the color components, highest vector first
    v = n_vect;
    while (v-- > 0) {
        for (comp = 0; comp < n_comp - 1; comp++) {
            Q_SHL(hi, 5);
            Q_OR32(hi, (dword)(palette[v][comp] / 8.0F));
        }
    }
    ((qword *)cc)[1] = hi;

    // Selectors are matched against all four entries, including the zero one
    // Right microtile
    for (t = N_TEXELS - 1; t >= N_TEXELS / 2; t--) {
        right = (right << 2) | fxt1_bestcol(palette, n_vect + 1, input[t], n_comp);
    }
    // Left microtile
    for (t = N_TEXELS / 2 - 1; t >= 0; t--) {
        left = (left << 2) | fxt1_bestcol(palette, n_vect + 1, input[t], n_comp);
    }
    cc[1] = right;
    cc[0] = left;
}

/*
Encode one block in CC_ALPHA mode with lerp = 1.

Three RGBA vectors are stored: vec[0] and vec[2] are private to the left
and right microtile, vec[1] is shared by both. Each microtile's texels
are given 2-bit selectors relative to the pair (own vector, shared vector).

cc                         Receives the 128-bit encoded block (4 dwords)
input[N_TEXELS][MAX_COMP]  The 32 texels of the block (0..15 left, 16..31 right)

MAKEIVEC and CALCCDOT are defined outside this file section; presumably
they build an interpolation vector and project a texel onto it to get
its selector - confirm against their definitions.
*/
static void
fxt1_quantize_ALPHA1(dword *cc,
    byte input[N_TEXELS][MAX_COMP])
{
    const int n_vect = 3; // Highest vector number in each microtile
    const int n_comp = 4; // 4 components: R, G, B, A
    float vec[1 + 1 + 1][MAX_COMP]; // 1.5 extrema for each sub-block
    float b, iv[MAX_COMP]; // Interpolation vector
    int i, j, k;
    qword hi; // High quadword
    dword lohi, lolo; // Low quadword: hi DWORD, lo DWORD

    int minSum;
    int maxSum;
    int minColL = 0, maxColL = 0;
    int minColR = 0, maxColR = 0;
    int sumL = 0, sumR = 0; // Totals of the per-texel sums, used as weights below

    /*
    TODO: Our solution here is to find the darkest and brightest colors in
    the 4x4 tile and use those as the two representative colors.
    There are probably better algorithms to use (histogram-based).
    */

    // Left microtile: locate the darkest and brightest texel
#ifndef YUV
    minSum = 2000; // Big enough
#else
    minSum = 2000000;
#endif
    maxSum = -1; // Small enough
    for (k = 0; k < N_TEXELS / 2; k++) {
        int sum = 0;
#ifndef YUV
        for (i = 0; i < n_comp; i++) {
            sum += input[k][i];
        }
#else
        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] + 114 * input[k][BCOMP];
#endif
        if (minSum > sum) {
            minSum = sum;
            minColL = k;
        }
        if (maxSum < sum) {
            maxSum = sum;
            maxColL = k;
        }
        sumL += sum;
    }
    // Right microtile: same scan, k continues from N_TEXELS / 2
#ifndef YUV
    minSum = 2000; // Big enough
#else
    minSum = 2000000;
#endif
    maxSum = -1; // Small enough
    for (; k < N_TEXELS; k++) {
        int sum = 0;
#ifndef YUV
        for (i = 0; i < n_comp; i++) {
            sum += input[k][i];
        }
#else
        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] + 114 * input[k][BCOMP];
#endif
        if (minSum > sum) {
            minSum = sum;
            minColR = k;
        }
        if (maxSum < sum) {
            maxSum = sum;
            maxColR = k;
        }
        sumR += sum;
    }

    // Choose the common vector
    {
        int j1, j2;
        int v1 = 0, v2 = 0;
        float err = 1e9; // Big enough
        float tv[2 * 2][MAX_COMP]; // 2 extrema for each sub-block
        // tv[0..1] = left min/max, tv[2..3] = right min/max
        for (i = 0; i < n_comp; i++) {
            tv[0][i] = input[minColL][i];
            tv[1][i] = input[maxColL][i];
            tv[2][i] = input[minColR][i];
            tv[3][i] = input[maxColR][i];
        }
        // Find the (left, right) pair of extrema that lie closest together
        for (j1 = 0; j1 < 2; j1++) {
            for (j2 = 2; j2 < 4; j2++) {
                float e = 0.0F;
                for (i = 0; i < n_comp; i++) {
                    e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
                }
                if (e < err) {
                    err = e;
                    v1 = j1;
                    v2 = j2;
                }
            }
        }
        // vec[1] blends the closest pair, weighted by each half's total sum;
        // vec[0] / vec[2] take the remaining left / right extremum.
        // NOTE(review): sumL + sumR is zero if every texel sums to zero, making
        // this a division by zero - confirm callers never pass such a block.
        for (i = 0; i < n_comp; i++) {
            vec[0][i] = tv[1 - v1][i];
            vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR);
            vec[2][i] = tv[5 - v2][i];
        }
    }

    // Left microtile
    cc[0] = 0;
    if (minColL != maxColL) {
        // Compute interpolation vector
        MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);

        // Add in texels
        lolo = 0;
        for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
            int texel;
            // Interpolate color
            CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
            // Add in texel
            lolo <<= 2;
            lolo |= texel;
        }

        cc[0] = lolo;
    }

    // Right microtile
    cc[1] = 0;
    if (minColR != maxColR) {
        // Compute interpolation vector
        MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);

        // Add in texels
        lohi = 0;
        for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
            int texel;
            // Interpolate color
            CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
            // Add in texel
            lohi <<= 2;
            lohi |= texel;
        }

        cc[1] = lohi;
    }

    // High quadword: mode bits, then three 5-bit alphas, then three 5-5-5 colors
    Q_MOV32(hi, 7); // Alpha = "011" + lerp = 1
    for (j = n_vect - 1; j >= 0; j--) {
        // Add in alphas
        Q_SHL(hi, 5);
        Q_OR32(hi, (dword)(vec[j][ACOMP] / 8.0F));
    }
    for (j = n_vect - 1; j >= 0; j--) {
        for (i = 0; i < n_comp - 1; i++) {
            // Add in colors
            Q_SHL(hi, 5);
            Q_OR32(hi, (dword)(vec[j][i] / 8.0F));
        }
    }
    ((qword *)cc)[1] = hi;
}

/*
Encode one block in CC_HI mode: two RGB endpoint colors and a 3-bit
selector per texel (0..6 interpolate between the endpoints, 7 is
transparent black).

cc                         Receives the 128-bit encoded block (4 dwords)
input[N_TEXELS][MAX_COMP]  The 32 texels of the block
reord[N_TEXELS][MAX_COMP]  The non-transparent-black texels, packed at the front
n                          Number of valid entries in reord

MAKEIVEC and CALCCDOT are defined outside this file section; presumably
they build an interpolation vector and project a texel onto it to get
its selector - confirm against their definitions.
*/
static void
fxt1_quantize_HI(dword *cc,
    byte input[N_TEXELS][MAX_COMP],
    byte reord[N_TEXELS][MAX_COMP], int n)
{
    const int n_vect = 6; // Highest vector number
    const int n_comp = 3; // 3 components: R, G, B
    float b = 0.0F;       // phoudoin: Silent compiler!
    float iv[MAX_COMP];   // Interpolation vector
    int i, k;
    dword hihi; // High quadword: hi DWORD

#ifndef YUV
    int minSum = 2000; // Big enough
#else
    int minSum = 2000000;
#endif
    int maxSum = -1; // Small enough
    int minCol = 0; // phoudoin: Silent compiler!
    int maxCol = 0; // phoudoin: Silent compiler!

    /*
    Our solution here is to find the darkest and brightest colors in
    the 8x4 tile and use those as the two representative colors.
    There are probably better algorithms to use (histogram-based).
    */

    for (k = 0; k < n; k++) {
        int sum = 0;
#ifndef YUV
        for (i = 0; i < n_comp; i++) {
            sum += reord[k][i];
        }
#else
        // k indexes reord here: minCol / maxCol are used to index reord below,
        // and only the first n entries of reord are meaningful
        sum = 299 * reord[k][RCOMP] + 587 * reord[k][GCOMP] + 114 * reord[k][BCOMP];
#endif
        if (minSum > sum) {
            minSum = sum;
            minCol = k;
        }
        if (maxSum < sum) {
            maxSum = sum;
            maxCol = k;
        }
    }

    // Pack the two endpoints as 5-5-5: brightest first, then darkest
    hihi = 0; // cc-hi = "00"
    for (i = 0; i < n_comp; i++) {
        // Add in colors
        hihi <<= 5;
        hihi |= reord[maxCol][i] >> 3;
    }
    for (i = 0; i < n_comp; i++) {
        // Add in colors
        hihi <<= 5;
        hihi |= reord[minCol][i] >> 3;
    }
    cc[3] = hihi;
    cc[0] = cc[1] = cc[2] = 0;

    // Compute interpolation vector
    if (minCol != maxCol) {
        MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
    }

    // Add in texels
    for (k = N_TEXELS - 1; k >= 0; k--) {
        // Selector k occupies bits [3k, 3k + 2]; it is OR-ed in through a dword
        // window that starts at the byte holding its first bit
        int t = k * 3;
        dword *kk = (dword *)((byte *)cc + t / 8);
        int texel = n_vect + 1; // Transparent black

        if (!ISTBLACK(input[k])) {
            if (minCol != maxCol) {
                // Interpolate color
                CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
                // Add in texel
                kk[0] |= texel << (t & 7);
            }
            // With a single representative color the selector stays 0
        }
        else {
            // Add in texel
            kk[0] |= texel << (t & 7);
        }
    }
}

/*
Encode one block in CC_MIXED mode with alpha = 1: each microtile stores
two RGB extrema, and every texel gets a 2-bit selector where the value
n_vect + 1 (3) marks transparent black.

cc                         Receives the 128-bit encoded block (4 dwords)
input[N_TEXELS][MAX_COMP]  The 32 texels of the block (0..15 left, 16..31 right)

MAKEIVEC and CALCCDOT are defined outside this file section; presumably
they build an interpolation vector and project a texel onto it to get
its selector - confirm against their definitions.
*/
static void
fxt1_quantize_MIXED1(dword *cc,
    byte input[N_TEXELS][MAX_COMP])
{
    const int n_vect = 2; // Highest vector number in each microtile
    const int n_comp = 3; // 3 components: R, G, B
    byte vec[2 * 2][MAX_COMP]; // 2 extrema for each sub-block
    float b, iv[MAX_COMP]; // Interpolation vector
    int i, j, k;
    qword hi; // High quadword
    dword lohi, lolo; // Low quadword: hi DWORD, lo DWORD

    int minSum;
    int maxSum;
    // maxCol* stays -1 when a microtile holds only transparent black texels
    int minColL = 0, maxColL = -1;
    int minColR = 0, maxColR = -1;

    /*
    Our solution here is to find the darkest and brightest colors in
    the 4x4 tile and use those as the two representative colors.
    There are probably better algorithms to use (histogram-based).
    */

    // Left microtile: scan only the texels that are not transparent black
#ifndef YUV
    minSum = 2000; // Big enough
#else
    minSum = 2000000;
#endif
    maxSum = -1; // Small enough
    for (k = 0; k < N_TEXELS / 2; k++) {
        if (!ISTBLACK(input[k])) {
            int sum = 0;
#ifndef YUV
            for (i = 0; i < n_comp; i++) {
                sum += input[k][i];
            }
#else
            sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] + 114 * input[k][BCOMP];
#endif
            if (minSum > sum) {
                minSum = sum;
                minColL = k;
            }
            if (maxSum < sum) {
                maxSum = sum;
                maxColL = k;
            }
        }
    }
    // Right microtile: same scan, k continues from N_TEXELS / 2
#ifndef YUV
    minSum = 2000; // Big enough
#else
    minSum = 2000000;
#endif
    maxSum = -1; // Small enough
    for (; k < N_TEXELS; k++) {
        if (!ISTBLACK(input[k])) {
            int sum = 0;
#ifndef YUV
            for (i = 0; i < n_comp; i++) {
                sum += input[k][i];
            }
#else
            sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] + 114 * input[k][BCOMP];
#endif
            if (minSum > sum) {
                minSum = sum;
                minColR = k;
            }
            if (maxSum < sum) {
                maxSum = sum;
                maxColR = k;
            }
        }
    }

    // Left microtile
    if (maxColL == -1) {
        // All transparent black
        cc[0] = 0xFFFFFFFFUL;
        for (i = 0; i < n_comp; i++) {
            vec[0][i] = 0;
            vec[1][i] = 0;
        }
    }
    else {
        cc[0] = 0;
        for (i = 0; i < n_comp; i++) {
            vec[0][i] = input[minColL][i];
            vec[1][i] = input[maxColL][i];
        }
        // NOTE(review): when minColL == maxColL every selector stays 0, including
        // those of any transparent black texels in this half - confirm intended.
        if (minColL != maxColL) {
            // Compute interpolation vector
            MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);

            // Add in texels
            lolo = 0;
            for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
                int texel = n_vect + 1;	// Transparent black
                if (!ISTBLACK(input[k])) {
                    // Interpolate color
                    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
                }
                // Add in texel
                lolo <<= 2;
                lolo |= texel;
            }
            cc[0] = lolo;
        }
    }

    // Right microtile
    if (maxColR == -1) {
        // All transparent black
        cc[1] = 0xFFFFFFFFUL;
        for (i = 0; i < n_comp; i++) {
            vec[2][i] = 0;
            vec[3][i] = 0;
        }
    }
    else {
        cc[1] = 0;
        for (i = 0; i < n_comp; i++) {
            vec[2][i] = input[minColR][i];
            vec[3][i] = input[maxColR][i];
        }
        if (minColR != maxColR) {
            // Compute interpolation vector
            MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);

            // Add in texels
            lohi = 0;
            for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
                int texel = n_vect + 1;	// Transparent black
                if (!ISTBLACK(input[k])) {
                    // Interpolate color
                    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
                }
                // Add in texel
                lohi <<= 2;
                lohi |= texel;
            }
            cc[1] = lohi;
        }
    }

    // Header: 9 = mode bit plus alpha bit; bit 2 of vec[3] green and bit 2 of
    // vec[1] green (moved down to bit 1) are carried next to it. The four
    // vectors then follow as 5-5-5 each, highest first.
    Q_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); // Chroma = "1"
    for (j = 2 * 2 - 1; j >= 0; j--) {
        for (i = 0; i < n_comp; i++) {
            // Add in colors
            Q_SHL(hi, 5);
            Q_OR32(hi, vec[j][i] >> 3);
        }
    }
    ((qword *)cc)[1] = hi;
}

/*
Encode one block in CC_MIXED mode with alpha = 0: each microtile stores
two RGB extrema and every texel gets a 2-bit selector between them.

The extrema of a microtile are the texels with the lowest and highest
value in that microtile's highest-variance channel (the sum-based search
is kept under "#if 0" for reference).

cc                         Receives the 128-bit encoded block (4 dwords)
input[N_TEXELS][MAX_COMP]  The 32 texels of the block (0..15 left, 16..31 right)

MAKEIVEC and CALCCDOT are defined outside this file section; presumably
they build an interpolation vector and project a texel onto it to get
its selector - confirm against their definitions.
*/
static void
fxt1_quantize_MIXED0(dword *cc,
    byte input[N_TEXELS][MAX_COMP])
{
    const int n_vect = 3; // Highest vector number in each microtile
    const int n_comp = 3; // 3 components: R, G, B
    byte vec[2 * 2][MAX_COMP]; // 2 extrema for each sub-block
    float b, iv[MAX_COMP]; // Interpolation vector
    int i, j, k;
    qword hi; // High quadword
    dword lohi, lolo; // Low quadword: hi DWORD, lo DWORD

    int minColL = 0, maxColL = 0;
    int minColR = 0, maxColR = 0;
#if 0
    // Disabled: pick the extrema by component sum instead of by variance
    int minSum;
    int maxSum;

    /*
    Our solution here is to find the darkest and brightest colors in
    the 4x4 tile and use those as the two representative colors.
    There are probably better algorithms to use (histogram-based).
    */

#ifndef YUV
    minSum = 2000; // Big enough
#else
    minSum = 2000000;
#endif
    maxSum = -1; // Small enough
    for (k = 0; k < N_TEXELS / 2; k++) {
        int sum = 0;
#ifndef YUV
        for (i = 0; i < n_comp; i++) {
            sum += input[k][i];
        }
#else
        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] + 114 * input[k][BCOMP];
#endif
        if (minSum > sum) {
            minSum = sum;
            minColL = k;
        }
        if (maxSum < sum) {
            maxSum = sum;
            maxColL = k;
        }
    }
    minSum = 2000; // Big enough
    maxSum = -1; // Small enough
    for (; k < N_TEXELS; k++) {
        int sum = 0;
#ifndef YUV
        for (i = 0; i < n_comp; i++) {
            sum += input[k][i];
        }
#else
        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] + 114 * input[k][BCOMP];
#endif
        if (minSum > sum) {
            minSum = sum;
            minColR = k;
        }
        if (maxSum < sum) {
            maxSum = sum;
            maxColR = k;
        }
    }
#else
    int minVal;
    int maxVal;
    // Channel with the largest variance in each microtile
    int maxVarL = fxt1_variance(NULL, input, n_comp, N_TEXELS / 2);
    int maxVarR = fxt1_variance(NULL, &input[N_TEXELS / 2], n_comp, N_TEXELS / 2);

    /*
    Scan the channel with max variance for lo and hi
    and use those as the two representative colors.
    */

    minVal = 2000; // Big enough
    maxVal = -1; // Small enough
    for (k = 0; k < N_TEXELS / 2; k++) {
        int t = input[k][maxVarL];
        if (minVal > t) {
            minVal = t;
            minColL = k;
        }
        if (maxVal < t) {
            maxVal = t;
            maxColL = k;
        }
    }
    minVal = 2000; // Big enough
    maxVal = -1; // Small enough
    for (; k < N_TEXELS; k++) {
        int t = input[k][maxVarR];
        if (minVal > t) {
            minVal = t;
            minColR = k;
        }
        if (maxVal < t) {
            maxVal = t;
            maxColR = k;
        }
    }
#endif

    // Left microtile
    cc[0] = 0;
    for (i = 0; i < n_comp; i++) {
        vec[0][i] = input[minColL][i];
        vec[1][i] = input[maxColL][i];
    }
    if (minColL != maxColL) {
        // Compute interpolation vector
        MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);

        // Add in texels
        lolo = 0;
        for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
            int texel;
            // Interpolate color
            CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
            // Add in texel
            lolo <<= 2;
            lolo |= texel;
        }

        // Funky encoding for LSB of green
        // If bit 1 of the selector word differs from bit 2 of (green1 XOR green0),
        // swap the two endpoints and invert every selector to compensate
        if ((int)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
            for (i = 0; i < n_comp; i++) {
                vec[1][i] = input[minColL][i];
                vec[0][i] = input[maxColL][i];
            }
            lolo = ~lolo;
        }

        cc[0] = lolo;
    }

    // Right microtile
    cc[1] = 0;
    for (i = 0; i < n_comp; i++) {
        vec[2][i] = input[minColR][i];
        vec[3][i] = input[maxColR][i];
    }
    if (minColR != maxColR) {
        // Compute interpolation vector
        MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);

        // Add in texels
        lohi = 0;
        for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
            int texel;
            // Interpolate color
            CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
            // Add in texel
            lohi <<= 2;
            lohi |= texel;
        }

        // Funky encoding for LSB of green
        // Same endpoint swap as for the left microtile
        if ((int)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
            for (i = 0; i < n_comp; i++) {
                vec[3][i] = input[minColR][i];
                vec[2][i] = input[maxColR][i];
            }
            lohi = ~lohi;
        }

        cc[1] = lohi;
    }

    // Header: 8 = mode bit with alpha bit clear; bit 2 of vec[3] green and bit 2
    // of vec[1] green (moved down to bit 1) are carried next to it. The four
    // vectors then follow as 5-5-5 each, highest first.
    Q_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); // Chroma = "1"
    for (j = 2 * 2 - 1; j >= 0; j--) {
        for (i = 0; i < n_comp; i++) {
            // Add in colors
            Q_SHL(hi, 5);
            Q_OR32(hi, vec[j][i] >> 3);
        }
    }
    ((qword *)cc)[1] = hi;
}

/*
Encode one 8x4 block: gather its texels, classify the block, and hand
it to the matching mode encoder.

cc     Receives the 128-bit encoded block (4 dwords)
lines  Four pointers, one per block row, each at the row's first texel;
       the pointers are advanced as the texels are consumed
comps  Components per source texel (3 or 4)
*/
static void
fxt1_quantize(dword *cc, const byte *lines[], int comps)
{
    byte input[N_TEXELS][MAX_COMP]; // Texels in block layout order
    byte reord[N_TEXELS][MAX_COMP]; // Non-transparent-black texels, packed
    int n_solid = N_TEXELS;         // Number of entries used in reord
    int trualpha = 0;               // Set when a kept texel is not opaque
    int row, col, k;
#ifndef ARGB
    int c;
#endif

    if (comps == 3) {
        // Make the whole block opaque
        memset(input, -1, sizeof(input));
    }

    /* Block layout:
    00, 01, 02, 03, 08, 09, 0a, 0b
    10, 11, 12, 13, 18, 19, 1a, 1b
    04, 05, 06, 07, 0c, 0d, 0e, 0f
    14, 15, 16, 17, 1c, 1d, 1e, 1f
    */

    // 8 texels each line: columns 0-3 feed the left microtile, 4-7 the right one
#ifndef ARGB
    for (row = 0; row < 4; row++) {
        for (col = 0; col < 8; col++) {
            byte *texel = input[(col < 4) ? (col + row * 4) : (col + row * 4 + 12)];
            for (c = 0; c < comps; c++) {
                texel[c] = *lines[row]++;
            }
        }
    }
#else
    // H.Morii - support for ARGB inputs
    for (row = 0; row < 4; row++) {
        for (col = 0; col < 8; col++) {
            byte *texel = input[(col < 4) ? (col + row * 4) : (col + row * 4 + 12)];
            texel[2] = *lines[row]++;
            texel[1] = *lines[row]++;
            texel[0] = *lines[row]++;
            if (comps == 4) texel[3] = *lines[row]++;
        }
    }
#endif

    /* [dBorca]
    Stupidity flows forth from this
    */

    if (comps == 4) {
        // Skip all transparent black texels
        n_solid = 0;
        for (k = 0; k < N_TEXELS; k++) {
            // Test all components against 0
            if (ISTBLACK(input[k])) {
                continue;
            }
            // Texel is not transparent black
            COPY_4UBV(reord[n_solid], input[k]);
            if (reord[n_solid][ACOMP] < (255 - ALPHA_TS)) {
                // Non-opaque texel
                trualpha = !0;
            }
            n_solid++;
        }
    }

#if 0
    if (trualpha) {
        fxt1_quantize_ALPHA0(cc, input, reord, n_solid);
    }
    else if (n_solid == 0) {
        cc[0] = cc[1] = cc[2] = -1;
        cc[3] = 0;
    }
    else if (n_solid < N_TEXELS) {
        fxt1_quantize_HI(cc, input, reord, n_solid);
    }
    else {
        fxt1_quantize_CHROMA(cc, input);
    }
    (void)fxt1_quantize_ALPHA1;
    (void)fxt1_quantize_MIXED1;
    (void)fxt1_quantize_MIXED0;
#else
    if (trualpha) {
        // At least one translucent texel
        fxt1_quantize_ALPHA1(cc, input);
    }
    else if (n_solid == 0) {
        // Every texel is transparent black: all selector bits set, colors zero
        cc[0] = cc[1] = cc[2] = 0xFFFFFFFFUL;
        cc[3] = 0;
    }
    else if (n_solid < N_TEXELS) {
        // Opaque texels mixed with transparent black ones
        fxt1_quantize_MIXED1(cc, input);
    }
    else {
        // Fully opaque block
        fxt1_quantize_MIXED0(cc, input);
    }
    // Reference the unused encoders so they do not trigger warnings
    (void)fxt1_quantize_ALPHA0;
    (void)fxt1_quantize_HI;
    (void)fxt1_quantize_CHROMA;
#endif
}

/*
Compress an image to FXT1.

width, height   Image size in texels
comps           Components per source texel (3 or 4)
source          Source pixels
srcRowStride    Bytes between consecutive source rows
dest            Receives the encoded blocks, 128 bits per 8x4 block
destRowStride   Bytes between consecutive rows of blocks in dest

If width is not a multiple of 8 or height is not a multiple of 4, the
image is first replicated into a temporary buffer of the padded size.

Returns 0 on success, or -1 if that temporary buffer cannot be allocated
(dest is left untouched in that case).
*/
TAPI int TAPIENTRY
fxt1_encode(int width, int height, int comps,
    const void *source, int srcRowStride,
    void *dest, int destRowStride)
{
    int x, y;
    const byte *data;
    dword *encoded = (dword *)dest;
    void *newSource = NULL;

    // Replicate image if width is not M8 or height is not M4
    if ((width & 7) | (height & 3)) {
        int newWidth = (width + 7) & ~7;
        int newHeight = (height + 3) & ~3;
        // One byte per component; the product is formed in size_t to avoid int overflow
        newSource = malloc((size_t)comps * newWidth * newHeight * sizeof(byte));
        if (newSource == NULL) {
            return -1;
        }
        _mesa_upscale_teximage2d(width, height, newWidth, newHeight,
            comps, (const byte *)source,
            srcRowStride, (byte *)newSource);
        source = newSource;
        width = newWidth;
        height = newHeight;
        srcRowStride = comps * newWidth;
    }

    data = (const byte *)source;
    // A row of blocks emits width * 2 bytes; convert the leftover stride to dwords
    destRowStride = (destRowStride - width * 2) / 4;
    for (y = 0; y < height; y += 4) {
        unsigned int offs = 0 + (y + 0) * srcRowStride;
        for (x = 0; x < width; x += 8) {
            const byte *lines[4];
            lines[0] = &data[offs];
            lines[1] = lines[0] + srcRowStride;
            lines[2] = lines[1] + srcRowStride;
            lines[3] = lines[2] + srcRowStride;
            offs += 8 * comps;
            fxt1_quantize(encoded, lines, comps);
            // 128 bits per 8x4 block
            encoded += 4;
        }
        encoded += destRowStride;
    }

    // free(NULL) is a no-op, so no guard is needed
    free(newSource);

    return 0;
}

/*
FXT1 decoder
The decoder is based on GL_3DFX_texture_compression_FXT1
specification and serves as a concept for the encoder.
*/

// Lookup table for scaling 5-bit color up to 8-bit color
static const byte _rgb_scale_5[] = {
    0,   8,   16,  25,  33,  41,  49,  58,
    66,  74,  82,  90,  99,  107, 115, 123,
    132, 140, 148, 156, 165, 173, 181, 189,
    197, 206, 214, 222, 230, 239, 247, 255
};

// Lookup table for scaling 6-bit color up to 8-bit color.
// Entry i equals round(i * 255 / 63), so 0 maps to 0 and 63 maps to 255.
// Accessed through the UP6() macro, which builds the 6-bit index from a
// 5-bit value and one extra least-significant bit.
static const byte _rgb_scale_6[] = {
    0,   4,   8,   12,  16,  20,  24,  28,
    32,  36,  40,  45,  49,  53,  57,  61,
    65,  69,  73,  77,  81,  85,  89,  93,
    97,  101, 105, 109, 113, 117, 121, 125,
    130, 134, 138, 142, 146, 150, 154, 158,
    162, 166, 170, 174, 178, 182, 186, 190,
    194, 198, 202, 206, 210, 215, 219, 223,
    227, 231, 235, 239, 243, 247, 251, 255
};

/*
Decoder helper macros. Every expansion is fully parenthesized so it can be
used safely inside a larger expression.

CC_SEL(cc, which)   - view cc as an array of dwords and return the dword that
                      holds bit number `which`, shifted right so that bit
                      lands at position 0 (higher bits are NOT masked off)
UP5(c)              - expand the low 5 bits of c to 8 bits
UP6(c, b)           - expand the 6-bit value made of the low 5 bits of c
                      followed by the low bit of b to 8 bits
LERP(n, t, c0, c1)  - rounded integer interpolation, t/n of the way from
                      c0 to c1
ZERO_4UBV(v)        - store a zero dword at v
*/
#define CC_SEL(cc, which) (((const dword *)(cc))[(which) / 32] >> ((which) & 31))
#define UP5(c) (_rgb_scale_5[(c) & 31])
#define UP6(c, b) (_rgb_scale_6[(((c) & 31) << 1) | ((b) & 1)])
#define LERP(n, t, c0, c1) ((((n) - (t)) * (c0) + (t) * (c1) + (n) / 2) / (n))
#define ZERO_4UBV(v) (*((dword *)(v)) = 0)

// Decode one texel of a CC_HI block.
//   code - start of the encoded block
//   t    - texel index inside the block
//   rgba - receives the decoded texel (indexed by RCOMP/GCOMP/BCOMP/ACOMP)
// Each texel has a 3-bit selector stored at bit position t * 3. Selector 7
// clears the texel with ZERO_4UBV. Selectors 0 and 6 pick one of the two
// 5:5:5 colors packed in the dword at byte offset 12; selectors 1..5
// interpolate between those colors in sixths. Every non-cleared texel gets
// alpha 255.
static void
fxt1_decode_1HI(const byte *code, int t, byte *rgba)
{
    const int bitpos = t * 3;
    const dword *cc = (const dword *)(code + bitpos / 8);
    const int sel = (cc[0] >> (bitpos & 7)) & 7;

    if (sel == 7) {
        ZERO_4UBV(rgba);
        return;
    }

    // Both endpoint colors are packed in the last dword of the block
    cc = (const dword *)(code + 12);
    switch (sel) {
    case 0:
        // First endpoint, bits 0..14
        rgba[BCOMP] = UP5(CC_SEL(cc, 0));
        rgba[GCOMP] = UP5(CC_SEL(cc, 5));
        rgba[RCOMP] = UP5(CC_SEL(cc, 10));
        break;
    case 6:
        // Second endpoint, bits 15..29
        rgba[BCOMP] = UP5(CC_SEL(cc, 15));
        rgba[GCOMP] = UP5(CC_SEL(cc, 20));
        rgba[RCOMP] = UP5(CC_SEL(cc, 25));
        break;
    default:
        // sel / 6 of the way from the first endpoint to the second
        rgba[BCOMP] = LERP(6, sel, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
        rgba[GCOMP] = LERP(6, sel, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
        rgba[RCOMP] = LERP(6, sel, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
        break;
    }
    rgba[ACOMP] = 255;
}

// Decode one texel of a CC_CHROMA block.
//   code - start of the encoded block
//   t    - texel index inside the block; bit 16 selects the second dword
//          of 2-bit selectors
//   rgba - receives the decoded texel (indexed by RCOMP/GCOMP/BCOMP/ACOMP)
// The 2-bit selector picks one of four 15-bit 5:5:5 colors stored back to
// back starting at bit 64 of the block. Alpha is always 255.
//
// The color field is assembled from the block's own dwords rather than with
// a dword load at a computed byte offset: for selector 3 that load started
// at byte 13 and therefore touched byte 16, one past the end of the block,
// and it was unaligned as well.
// NOTE(review): assumes dword is an unsigned 32-bit type; on a little-endian
// host this produces the same bits as the byte-offset load it replaces.
static void
fxt1_decode_1CHROMA(const byte *code, int t, byte *rgba)
{
    const dword *block = (const dword *)code;
    const dword *cc = block;
    dword kk;

    if (t & 16) {
        cc++;
        t &= 15;
    }
    t = (cc[0] >> (t * 2)) & 3;

    // Color t occupies block bits 64 + 15 * t .. 78 + 15 * t. Only the low
    // 15 bits of kk are consumed below (UP5 masks each channel to 5 bits).
    switch (t) {
    case 0:
        kk = block[2];
        break;
    case 1:
        kk = block[2] >> 15;
        break;
    case 2:
        // Field straddles dwords 2 and 3: two bits from one, the rest from the other
        kk = (block[2] >> 30) | (block[3] << 2);
        break;
    default:
        kk = block[3] >> 13;
        break;
    }

    rgba[BCOMP] = UP5(kk);
    rgba[GCOMP] = UP5(kk >> 5);
    rgba[RCOMP] = UP5(kk >> 10);
    rgba[ACOMP] = 255;
}

// Decode one texel of a CC_MIXED block.
//   code - start of the encoded block
//   t    - texel index inside the block; bit 16 selects the second half,
//          which has its own selector dword and its own color pair
//   rgba - receives the decoded texel (indexed by RCOMP/GCOMP/BCOMP/ACOMP)
// Bit 124 (alpha[0]) chooses between two interpretations of the 2-bit
// selector: when set, selector 3 clears the texel and selector 1 is the
// average of the two colors; when clear, the selector interpolates between
// the two colors in thirds. Green may carry a sixth bit (glsb / selb).
static void
fxt1_decode_1MIXED(const byte *code, int t, byte *rgba)
{
    const dword *cc = (const dword *)code;
    int col[2][3];
    int glsb, selb;
    int sel;

    if ((t & 16) == 0) {
        sel = (cc[0] >> (t * 2)) & 3;
        // Col 0
        col[0][BCOMP] = CC_SEL(cc, 64);
        col[0][GCOMP] = CC_SEL(cc, 69);
        col[0][RCOMP] = CC_SEL(cc, 74);
        // Col 1
        col[1][BCOMP] = CC_SEL(cc, 79);
        col[1][GCOMP] = CC_SEL(cc, 84);
        col[1][RCOMP] = CC_SEL(cc, 89);
        glsb = CC_SEL(cc, 125);
        selb = CC_SEL(cc, 1);
    }
    else {
        sel = (cc[1] >> ((t & 15) * 2)) & 3;
        // Col 2: blue starts at bit 94 and crosses a dword boundary, so it
        // is fetched with a load at byte 11 (bit 88) shifted by 6
        col[0][BCOMP] = (*(const dword *)(code + 11)) >> 6;
        col[0][GCOMP] = CC_SEL(cc, 99);
        col[0][RCOMP] = CC_SEL(cc, 104);
        // Col 3
        col[1][BCOMP] = CC_SEL(cc, 109);
        col[1][GCOMP] = CC_SEL(cc, 114);
        col[1][RCOMP] = CC_SEL(cc, 119);
        glsb = CC_SEL(cc, 126);
        selb = CC_SEL(cc, 33);
    }

    if ((CC_SEL(cc, 124) & 1) == 0) {
        // alpha[0] == 0: four opaque colors, interpolated in thirds
        switch (sel) {
        case 0:
            rgba[BCOMP] = UP5(col[0][BCOMP]);
            rgba[GCOMP] = UP6(col[0][GCOMP], glsb ^ selb);
            rgba[RCOMP] = UP5(col[0][RCOMP]);
            break;
        case 3:
            rgba[BCOMP] = UP5(col[1][BCOMP]);
            rgba[GCOMP] = UP6(col[1][GCOMP], glsb);
            rgba[RCOMP] = UP5(col[1][RCOMP]);
            break;
        default:
            rgba[BCOMP] = LERP(3, sel, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
            rgba[GCOMP] = LERP(3, sel, UP6(col[0][GCOMP], glsb ^ selb),
                UP6(col[1][GCOMP], glsb));
            rgba[RCOMP] = LERP(3, sel, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
            break;
        }
        rgba[ACOMP] = 255;
        return;
    }

    // alpha[0] == 1: selector 3 clears the texel
    if (sel == 3) {
        ZERO_4UBV(rgba);
        return;
    }

    switch (sel) {
    case 0:
        rgba[BCOMP] = UP5(col[0][BCOMP]);
        rgba[GCOMP] = UP5(col[0][GCOMP]);
        rgba[RCOMP] = UP5(col[0][RCOMP]);
        break;
    case 2:
        rgba[BCOMP] = UP5(col[1][BCOMP]);
        rgba[GCOMP] = UP6(col[1][GCOMP], glsb);
        rgba[RCOMP] = UP5(col[1][RCOMP]);
        break;
    default:
        // Selector 1: midpoint of the two colors
        rgba[BCOMP] = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
        rgba[GCOMP] = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
        rgba[RCOMP] = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
        break;
    }
    rgba[ACOMP] = 255;
}

// Decode one texel of a CC_ALPHA block.
//   code - start of the encoded block
//   t    - texel index inside the block; bit 16 selects the second dword
//          of 2-bit selectors
//   rgba - receives the decoded texel (indexed by RCOMP/GCOMP/BCOMP/ACOMP)
// Bit 124 (lerp) selects the mode. With lerp clear, the selector picks one
// of three 5:5:5:5 colors and selector 3 clears the texel. With lerp set,
// the selector interpolates in thirds between a per-half start color
// (color 0 or color 2) and color 1, alpha included.
static void
fxt1_decode_1ALPHA(const byte *code, int t, byte *rgba)
{
    const dword *cc = (const dword *)code;
    int col0[4];
    int sel;

    if ((CC_SEL(cc, 124) & 1) == 0) {
        // lerp == 0
        dword kk;
        int bitpos;

        if (t & 16) {
            sel = (cc[1] >> ((t & 15) * 2)) & 3;
        }
        else {
            sel = (cc[0] >> (t * 2)) & 3;
        }

        if (sel == 3) {
            ZERO_4UBV(rgba);
            return;
        }

        // Alpha values are 5-bit fields starting at bit 13 of the last dword
        rgba[ACOMP] = UP5(cc[3] >> (sel * 5 + 13));
        // Colors are 15-bit fields packed from byte offset 8 onwards
        bitpos = sel * 15;
        kk = (*(const dword *)(code + 8 + bitpos / 8)) >> (bitpos & 7);
        rgba[BCOMP] = UP5(kk);
        rgba[GCOMP] = UP5(kk >> 5);
        rgba[RCOMP] = UP5(kk >> 10);
        return;
    }

    // lerp == 1
    if (t & 16) {
        sel = (cc[1] >> ((t & 15) * 2)) & 3;
        // Col 2: blue starts at bit 94 and crosses a dword boundary
        col0[BCOMP] = (*(const dword *)(code + 11)) >> 6;
        col0[GCOMP] = CC_SEL(cc, 99);
        col0[RCOMP] = CC_SEL(cc, 104);
        col0[ACOMP] = CC_SEL(cc, 119);
    }
    else {
        sel = (cc[0] >> (t * 2)) & 3;
        // Col 0
        col0[BCOMP] = CC_SEL(cc, 64);
        col0[GCOMP] = CC_SEL(cc, 69);
        col0[RCOMP] = CC_SEL(cc, 74);
        col0[ACOMP] = CC_SEL(cc, 109);
    }

    switch (sel) {
    case 0:
        // Start color of this half
        rgba[BCOMP] = UP5(col0[BCOMP]);
        rgba[GCOMP] = UP5(col0[GCOMP]);
        rgba[RCOMP] = UP5(col0[RCOMP]);
        rgba[ACOMP] = UP5(col0[ACOMP]);
        break;
    case 3:
        // Col 1, shared by both halves
        rgba[BCOMP] = UP5(CC_SEL(cc, 79));
        rgba[GCOMP] = UP5(CC_SEL(cc, 84));
        rgba[RCOMP] = UP5(CC_SEL(cc, 89));
        rgba[ACOMP] = UP5(CC_SEL(cc, 114));
        break;
    default:
        // sel / 3 of the way from the start color to col 1
        rgba[BCOMP] = LERP(3, sel, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
        rgba[GCOMP] = LERP(3, sel, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
        rgba[RCOMP] = LERP(3, sel, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
        rgba[ACOMP] = LERP(3, sel, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
        break;
    }
}

// Decode a single texel from an FXT1-compressed texture.
//   texture - start of the compressed data
//   stride  - row length in texels (stride / 8 blocks per row of blocks)
//   i, j    - texel column and row
//   rgba    - receives the decoded texel
// Locates the 16-byte block covering (i, j), reads its 3-bit mode from
// bit 125 upwards and hands the block to the matching mode decoder.
TAPI void TAPIENTRY
fxt1_decode_1(const void *texture, int stride,
    int i, int j, byte *rgba)
{
    // Indexed by the 3-bit mode field
    static void(*decode_1[]) (const byte *, int, byte *) = {
        fxt1_decode_1HI,	// cc-high   = "00?"
        fxt1_decode_1HI,	// cc-high   = "00?"
        fxt1_decode_1CHROMA,	// cc-chroma = "010"
        fxt1_decode_1ALPHA,	// alpha     = "011"
        fxt1_decode_1MIXED,	// mixed     = "1??"
        fxt1_decode_1MIXED,	// mixed     = "1??"
        fxt1_decode_1MIXED,	// mixed     = "1??"
        fxt1_decode_1MIXED	// mixed     = "1??"
    };

    // Blocks are 8 texels wide, 4 texels high and 16 bytes long
    const int blockIndex = (j / 4) * (stride / 8) + (i / 8);
    const byte *code = (const byte *)texture + blockIndex * 16;
    int mode = CC_SEL(code, 125);

    // Texel index inside the block: columns 0..3 map to 0..15 and
    // columns 4..7 map to 16..31, four texels per row in each half
    int t = (i & 7) + (j & 3) * 4;
    if (i & 4) {
        t += 12;
    }

    decode_1[mode](code, t, rgba);

#if VERBOSE
    {
        // Per-mode usage counters, defined elsewhere
        extern int cc_chroma;
        extern int cc_alpha;
        extern int cc_high;
        extern int cc_mixed;
        static int *cctype[] = {
            &cc_high,
            &cc_high,
            &cc_chroma,
            &cc_alpha,
            &cc_mixed,
            &cc_mixed,
            &cc_mixed,
            &cc_mixed
        };
        (*cctype[mode])++;
    }
#endif
}