// File: crn_dxt1.cpp
// See Copyright Notice and license at the end of inc/crnlib.h
//
// Notes:
// This class is not optimized for performance on small blocks, unlike typical DXT1 compressors. It's optimized for scalability and quality:
// - Very high quality in terms of avg. RMSE or Luma RMSE. Goal is to always match or beat every other known offline DXTc compressor: ATI_Compress, squish, NVidia texture tools, nvdxt.exe, etc.
// - Reasonable scalability and stability with hundreds to many thousands of input colors (including inputs with many thousands of equal/nearly equal colors).
// - Any quality optimization which results in even a tiny improvement is worth it -- as long as it's either a constant or linear slowdown.
//   Tiny quality improvements can be extremely valuable in large clusters.
// - Quality should scale well vs. CPU time cost, i.e. the more time you spend the higher the quality.
#include "crn_core.h"
#include "crn_dxt1.h"
#include "crn_ryg_dxt.hpp"
#include "crn_dxt_fast.h"
#include "crn_intersect.h"
#include "crn_vec_interval.h"

namespace crnlib
{
   //-----------------------------------------------------------------------------------------------------------------------------------------

   static const int16 g_fast_probe_table[] = { 0, 1, 2, 3 };
   static const uint cFastProbeTableSize = sizeof(g_fast_probe_table) / sizeof(g_fast_probe_table[0]);

   static const int16 g_normal_probe_table[] = { 0, 1, 3, 5, 7 };
   static const uint cNormalProbeTableSize = sizeof(g_normal_probe_table) / sizeof(g_normal_probe_table[0]);

   static const int16 g_better_probe_table[] = { 0, 1, 2, 3, 5, 9, 15, 19, 27, 43 };
   static const uint cBetterProbeTableSize = sizeof(g_better_probe_table) / sizeof(g_better_probe_table[0]);

   static const int16 g_uber_probe_table[] = { 0, 1, 2, 3, 5, 7, 9, 10, 13, 15, 19, 27, 43, 59, 91 };
   static const uint cUberProbeTableSize = sizeof(g_uber_probe_table) / sizeof(g_uber_probe_table[0]);

   //-----------------------------------------------------------------------------------------------------------------------------------------

   dxt1_endpoint_optimizer::dxt1_endpoint_optimizer() :
      m_pParams(NULL),
      m_pResults(NULL),
      m_pSolutions(NULL),
      m_perceptual(false),
      m_has_color_weighting(false),
      m_all_pixels_grayscale(false)
   {
      m_low_coords.reserve(512);
      m_high_coords.reserve(512);

      m_unique_colors.reserve(512);
      m_temp_unique_colors.reserve(512);
      m_unique_packed_colors.reserve(512);

      m_norm_unique_colors.reserve(512);
      m_norm_unique_colors_weighted.reserve(512);

      m_lo_cells.reserve(128);
      m_hi_cells.reserve(128);
   }

   void dxt1_endpoint_optimizer::clear()
   {
      m_pParams = NULL;
      m_pResults = NULL;
      m_pSolutions = NULL;

      if (m_unique_color_hash_map.get_table_size() > 8192)
         m_unique_color_hash_map.clear();
      else
         m_unique_color_hash_map.reset();

      if (m_solutions_tried.get_table_size() > 8192)
         m_solutions_tried.clear();

      m_unique_colors.resize(0);

      m_has_transparent_pixels = false;
      m_total_unique_color_weight = 0;

      m_norm_unique_colors.resize(0);
      m_mean_norm_color.clear();

      m_norm_unique_colors_weighted.resize(0);
      m_mean_norm_color_weighted.clear();

      m_principle_axis.clear();

      m_total_evals = 0;
      m_all_pixels_grayscale = false;
      m_has_color_weighting = false;
      m_perceptual = false;
   }

   bool dxt1_endpoint_optimizer::handle_all_transparent_block()
   {
      m_pResults->m_low_color = 0;
      m_pResults->m_high_color = 0;
      m_pResults->m_alpha_block = true;

      memset(m_pResults->m_pSelectors, 3, m_pParams->m_num_pixels);

      return true;
   }

   // All selectors are equal. Try compressing as if it was solid, using the block's average color, using ryg's optimal single color compression tables.
   bool dxt1_endpoint_optimizer::try_average_block_as_solid()
   {
      uint64 tot_r = 0;
      uint64 tot_g = 0;
      uint64 tot_b = 0;

      uint total_weight = 0;
      for (uint i = 0; i < m_unique_colors.size(); i++)
      {
         uint weight = m_unique_colors[i].m_weight;
         total_weight += weight;

         tot_r += m_unique_colors[i].m_color.r * weight;
         tot_g += m_unique_colors[i].m_color.g * weight;
         tot_b += m_unique_colors[i].m_color.b * weight;
      }

      const uint half_total_weight = total_weight >> 1;
      uint ave_r = static_cast<uint>((tot_r + half_total_weight) / total_weight);
      uint ave_g = static_cast<uint>((tot_g + half_total_weight) / total_weight);
      uint ave_b = static_cast<uint>((tot_b + half_total_weight) / total_weight);

      uint low_color = (ryg_dxt::OMatch5[ave_r][0]<<11) | (ryg_dxt::OMatch6[ave_g][0]<<5) | ryg_dxt::OMatch5[ave_b][0];
      uint high_color = (ryg_dxt::OMatch5[ave_r][1]<<11) | (ryg_dxt::OMatch6[ave_g][1]<<5) | ryg_dxt::OMatch5[ave_b][1];
      bool improved = evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);

      if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
      {
         low_color = (ryg_dxt::OMatch5_3[ave_r][0]<<11) | (ryg_dxt::OMatch6_3[ave_g][0]<<5) | ryg_dxt::OMatch5_3[ave_b][0];
         high_color = (ryg_dxt::OMatch5_3[ave_r][1]<<11) | (ryg_dxt::OMatch6_3[ave_g][1]<<5) | ryg_dxt::OMatch5_3[ave_b][1];
         improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
      }

      if (m_pParams->m_quality == cCRNDXTQualityUber)
      {
         // Try compressing as all-solid using the other (non-average) colors in the block in uber.
         for (uint i = 0; i < m_unique_colors.size(); i++)
         {
            uint r = m_unique_colors[i].m_color[0];
            uint g = m_unique_colors[i].m_color[1];
            uint b = m_unique_colors[i].m_color[2];
            if ((r == ave_r) && (g == ave_g) && (b == ave_b))
               continue;

            uint low_color = (ryg_dxt::OMatch5[r][0]<<11) | (ryg_dxt::OMatch6[g][0]<<5) | ryg_dxt::OMatch5[b][0];
            uint high_color = (ryg_dxt::OMatch5[r][1]<<11) | (ryg_dxt::OMatch6[g][1]<<5) | ryg_dxt::OMatch5[b][1];
            improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);

            if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
            {
               low_color = (ryg_dxt::OMatch5_3[r][0]<<11) | (ryg_dxt::OMatch6_3[g][0]<<5) | ryg_dxt::OMatch5_3[b][0];
               high_color = (ryg_dxt::OMatch5_3[r][1]<<11) | (ryg_dxt::OMatch6_3[g][1]<<5) | ryg_dxt::OMatch5_3[b][1];
               improved |= evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
            }
         }
      }

      return improved;
   }

   // Block is solid, trying using ryg's optimal single color tables.
   bool dxt1_endpoint_optimizer::handle_solid_block()
   {
      int r = m_unique_colors[0].m_color.r;
      int g = m_unique_colors[0].m_color.g;
      int b = m_unique_colors[0].m_color.b;

      //uint packed_color = dxt1_block::pack_color(r, g, b, true);
      //evaluate_solution(dxt1_solution_coordinates((uint16)packed_color, (uint16)packed_color), false, &m_best_solution);

      uint low_color = (ryg_dxt::OMatch5[r][0]<<11) | (ryg_dxt::OMatch6[g][0]<<5) | ryg_dxt::OMatch5[b][0];
      uint high_color = (ryg_dxt::OMatch5[r][1]<<11) | (ryg_dxt::OMatch6[g][1]<<5) | ryg_dxt::OMatch5[b][1];
      evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), false, &m_best_solution);

      if ((m_pParams->m_use_alpha_blocks) && (m_best_solution.m_error))
      {
         low_color = (ryg_dxt::OMatch5_3[r][0]<<11) | (ryg_dxt::OMatch6_3[g][0]<<5) | ryg_dxt::OMatch5_3[b][0];
         high_color = (ryg_dxt::OMatch5_3[r][1]<<11) | (ryg_dxt::OMatch6_3[g][1]<<5) | ryg_dxt::OMatch5_3[b][1];
         evaluate_solution(dxt1_solution_coordinates((uint16)low_color, (uint16)high_color), true, &m_best_solution);
      }

      return_solution(*m_pResults, m_best_solution);

      return true;
   }

   void dxt1_endpoint_optimizer::compute_vectors(const vec3F& perceptual_weights)
   {
      m_norm_unique_colors.resize(0);
      m_norm_unique_colors_weighted.resize(0);

      m_mean_norm_color.clear();
      m_mean_norm_color_weighted.clear();

      for (uint i = 0; i < m_unique_colors.size(); i++)
      {
         const color_quad_u8& color = m_unique_colors[i].m_color;
         const uint weight = m_unique_colors[i].m_weight;

         vec3F norm_color(color.r * 1.0f/255.0f, color.g * 1.0f/255.0f, color.b * 1.0f/255.0f);
         vec3F norm_color_weighted(vec3F::mul_components(perceptual_weights, norm_color));

         m_norm_unique_colors.push_back(norm_color);
         m_norm_unique_colors_weighted.push_back(norm_color_weighted);

         m_mean_norm_color += norm_color * (float)weight;
         m_mean_norm_color_weighted += norm_color_weighted * (float)weight;
      }

      if (m_total_unique_color_weight)
      {
         m_mean_norm_color *= (1.0f / m_total_unique_color_weight);
         m_mean_norm_color_weighted *= (1.0f / m_total_unique_color_weight);
      }

      for (uint i = 0; i < m_unique_colors.size(); i++)
      {
         m_norm_unique_colors[i] -= m_mean_norm_color;
         m_norm_unique_colors_weighted[i] -= m_mean_norm_color_weighted;
      }
   }

   // Compute PCA (principle axis, i.e. direction of largest variance) of input vectors.
   void dxt1_endpoint_optimizer::compute_pca(vec3F& axis, const vec3F_array& norm_colors, const vec3F& def)
   {
#if 0
      axis.clear();

      CRNLIB_ASSERT(m_unique_colors.size() == norm_colors.size());

      // Incremental PCA
      bool first = true;
      for (uint i = 0; i < norm_colors.size(); i++)
      {
         const uint weight = m_unique_colors[i].m_weight;

         for (uint j = 0; j < weight; j++)
         {
            vec3F x(norm_colors[i] * norm_colors[i][0]);
            vec3F y(norm_colors[i] * norm_colors[i][1]);
            vec3F z(norm_colors[i] * norm_colors[i][2]);

            vec3F v(first ? norm_colors[0] : axis);
            first = false;

            v.normalize(&def);

            axis[0] += (x * v);
            axis[1] += (y * v);
            axis[2] += (z * v);
         }
      }

      axis.normalize(&def);
#else
      double cov[6] = { 0, 0, 0, 0, 0, 0 };

      //vec3F lo(math::cNearlyInfinite);
      //vec3F hi(-math::cNearlyInfinite);

      for(uint i = 0; i < norm_colors.size(); i++)
      {
         const vec3F& v = norm_colors[i];

         //if (v[0] < lo[0]) lo[0] = v[0];
         //if (v[1] < lo[1]) lo[1] = v[1];
         //if (v[2] < lo[2]) lo[2] = v[2];
         //if (v[0] > hi[0]) hi[0] = v[0];
         //if (v[1] > hi[1]) hi[1] = v[1];
         //if (v[2] > hi[2]) hi[2] = v[2];

         float r = v[0];
         float g = v[1];
         float b = v[2];

         if (m_unique_colors[i].m_weight > 1)
         {
            const double weight = m_unique_colors[i].m_weight;

            cov[0] += r*r*weight;
            cov[1] += r*g*weight;
            cov[2] += r*b*weight;
            cov[3] += g*g*weight;
            cov[4] += g*b*weight;
            cov[5] += b*b*weight;
         }
         else
         {
            cov[0] += r*r;
            cov[1] += r*g;
            cov[2] += r*b;
            cov[3] += g*g;
            cov[4] += g*b;
            cov[5] += b*b;
         }
      }

      double vfr, vfg, vfb;
      //vfr = hi[0] - lo[0];
      //vfg = hi[1] - lo[1];
      //vfb = hi[2] - lo[2];
      // This is more stable.
      vfr = .9f;
      vfg = 1.0f;
      vfb = .7f;

      const uint cNumIters = 8;

      for (uint iter = 0; iter < cNumIters; iter++)
      {
         double r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2];
         double g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4];
         double b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5];

         double m = math::maximum(fabs(r), fabs(g), fabs(b));
         if (m > 1e-10)
         {
            m = 1.0f / m;
            r *= m;
            g *= m;
            b *= m;
         }

         double delta = math::square(vfr-r) + math::square(vfg-g) + math::square(vfb-b);

         vfr = r;
         vfg = g;
         vfb = b;

         if ((iter > 2) && (delta < 1e-8))
            break;
      }

      double len = vfr*vfr + vfg*vfg + vfb*vfb;

      if (len < 1e-10)
      {
         axis = def;
      }
      else
      {
         len = 1.0f / sqrt(len);
         vfr *= len;
         vfg *= len;
         vfb *= len;

         axis.set(static_cast<float>(vfr), static_cast<float>(vfg), static_cast<float>(vfb));
      }
#endif
   }

   static const uint8 g_invTableNull[4] = { 0, 1, 2, 3 };
   static const uint8 g_invTableAlpha[4] = { 1, 0, 2, 3 };
   static const uint8 g_invTableColor[4] = { 1, 0, 3, 2 };

   // Computes a valid (encodable) DXT1 solution (low/high colors, swizzled selectors) from input.
   void dxt1_endpoint_optimizer::return_solution(results& res, const potential_solution& solution)
   {
      bool invert_selectors;

      if (solution.m_alpha_block)
         invert_selectors = (solution.m_coords.m_low_color > solution.m_coords.m_high_color);
      else
      {
         CRNLIB_ASSERT(solution.m_coords.m_low_color != solution.m_coords.m_high_color);

         invert_selectors = (solution.m_coords.m_low_color < solution.m_coords.m_high_color);
      }

      if (invert_selectors)
      {
         res.m_low_color = solution.m_coords.m_high_color;
         res.m_high_color = solution.m_coords.m_low_color;
      }
      else
      {
         res.m_low_color = solution.m_coords.m_low_color;
         res.m_high_color = solution.m_coords.m_high_color;
      }

      const uint8* pInvert_table = g_invTableNull;
      if (invert_selectors)
         pInvert_table = solution.m_alpha_block ? g_invTableAlpha : g_invTableColor;

      const uint alpha_thresh = m_pParams->m_pixels_have_alpha ? (m_pParams->m_dxt1a_alpha_threshold << 24U) : 0;

      const uint32* pSrc_pixels = reinterpret_cast<const uint32*>(m_pParams->m_pPixels);
      uint8* pDst_selectors = res.m_pSelectors;

      if ((m_unique_colors.size() == 1) && (!m_pParams->m_pixels_have_alpha))
      {
         uint32 c = utils::read_le32(pSrc_pixels);

         CRNLIB_ASSERT(c >= alpha_thresh);

         c |= 0xFF000000U;

         unique_color_hash_map::const_iterator it(m_unique_color_hash_map.find(c));
         CRNLIB_ASSERT(it != m_unique_color_hash_map.end());

         uint unique_color_index = it->second;

         uint selector = pInvert_table[solution.m_selectors[unique_color_index]];

         memset(pDst_selectors, selector, m_pParams->m_num_pixels);
      }
      else
      {
         uint8* pDst_selectors_end = pDst_selectors + m_pParams->m_num_pixels;

         uint8 prev_selector = 0;
         uint32 prev_color = 0;

         do
         {
            uint32 c = utils::read_le32(pSrc_pixels);
            pSrc_pixels++;

            uint8 selector = 3;

            if (c >= alpha_thresh)
            {
               c |= 0xFF000000U;

               if (c == prev_color)
                  selector = prev_selector;
               else
               {
                  unique_color_hash_map::const_iterator it(m_unique_color_hash_map.find(c));

                  CRNLIB_ASSERT(it != m_unique_color_hash_map.end());

                  uint unique_color_index = it->second;

                  selector = pInvert_table[solution.m_selectors[unique_color_index]];

                  prev_color = c;
                  prev_selector = selector;
               }
            }

            *pDst_selectors++ = selector;

         } while (pDst_selectors != pDst_selectors_end);
      }

      res.m_alpha_block = solution.m_alpha_block;
      res.m_error = solution.m_error;
   }

   inline vec3F dxt1_endpoint_optimizer::unpack_to_vec3F(uint16 packed_color)
   {
      color_quad_u8 c(dxt1_block::unpack_color(packed_color, false));

      return vec3F(c.r * 1.0f/31.0f, c.g * 1.0f/63.0f, c.b * 1.0f/31.0f);
   }

   inline vec3F dxt1_endpoint_optimizer::unpack_to_vec3F_raw(uint16 packed_color)
   {
      color_quad_u8 c(dxt1_block::unpack_color(packed_color, false));

      return vec3F(c.r, c.g, c.b);
   }

   // Per-component 1D endpoint optimization.
   void dxt1_endpoint_optimizer::optimize_endpoint_comps()
   {
      if ((m_best_solution.m_alpha_block) || (!m_best_solution.m_error))
         return;

      //color_quad_u8 orig_l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
      //color_quad_u8 orig_h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
      //uint orig_error = m_best_solution.m_error;

      color_quad_u8 orig_l_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true));
      color_quad_u8 orig_h_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true));

      color_quad_u8 min_color(0xFF, 0xFF, 0xFF, 0xFF);
      color_quad_u8 max_color(0, 0, 0, 0);
      for (uint i = 0; i < m_unique_colors.size(); i++)
      {
         min_color = color_quad_u8::component_min(min_color, m_unique_colors[i].m_color);
         max_color = color_quad_u8::component_max(max_color, m_unique_colors[i].m_color);
      }

      // Try to separately optimize each component. This is a 1D problem so it's easy to compute accurate per-component error bounds.
      for (uint comp_index = 0; comp_index < 3; comp_index++)
      {
         uint ll[4];
         ll[0] = orig_l_scaled[comp_index];
         ll[1] = orig_h_scaled[comp_index];
         ll[2] = (ll[0]*2+ll[1])/3;
         ll[3] = (ll[0]+ll[1]*2)/3;

         uint error_to_beat = 0;
         uint min_color_weight = 0;
         uint max_color_weight = 0;
         for (uint i = 0; i < m_unique_colors.size(); i++)
         {
            uint c = m_unique_colors[i].m_color[comp_index];
            uint w = m_unique_colors[i].m_weight;

            int delta = ll[m_best_solution.m_selectors[i]] - c;
            error_to_beat += (int)w * (delta * delta);

            if (c == min_color[comp_index])
               min_color_weight += w;
            if (c == max_color[comp_index])
               max_color_weight += w;
         }

         if (!error_to_beat)
            continue;

         CRNLIB_ASSERT((min_color_weight > 0) && (max_color_weight > 0));
         const uint error_to_beat_div_min_color_weight = min_color_weight ? ((error_to_beat + min_color_weight - 1) / min_color_weight) : 0;
         const uint error_to_beat_div_max_color_weight = max_color_weight ? ((error_to_beat + max_color_weight - 1) / max_color_weight) : 0;

         const uint m = (comp_index == 1) ? 63 : 31;
         const uint m_shift = (comp_index == 1) ? 3 : 2;

         for (uint o = 0; o <= m; o++)
         {
            uint tl[4];

            tl[0] = (comp_index == 1) ? ((o << 2) | (o >> 4)) : ((o << 3) | (o >> 2));

            for (uint h = 0; h < 8; h++)
            {
               const uint pl = h << m_shift;
               const uint ph = ((h + 1) << m_shift) - 1;

               uint tl_l = (comp_index == 1) ? ((pl << 2) | (pl >> 4)) : ((pl << 3) | (pl >> 2));
               uint tl_h = (comp_index == 1) ? ((ph << 2) | (ph >> 4)) : ((ph << 3) | (ph >> 2));

               tl_l = math::minimum(tl_l, tl[0]);
               tl_h = math::maximum(tl_h, tl[0]);

               uint c_l = min_color[comp_index];
               uint c_h = max_color[comp_index];

               if (c_h < tl_l)
               {
                  uint min_possible_error = math::square<int>(tl_l - c_l);
                  if (min_possible_error > error_to_beat_div_min_color_weight)
                     continue;
               }
               else if (c_l > tl_h)
               {
                  uint min_possible_error = math::square<int>(c_h - tl_h);
                  if (min_possible_error > error_to_beat_div_max_color_weight)
                     continue;
               }

               for (uint p = pl; p <= ph; p++)
               {
                  tl[1] = (comp_index == 1) ? ((p << 2) | (p >> 4)) : ((p << 3) | (p >> 2));

                  tl[2] = (tl[0]*2+tl[1])/3;
                  tl[3] = (tl[0]+tl[1]*2)/3;

                  uint trial_error = 0;
                  for (uint i = 0; i < m_unique_colors.size(); i++)
                  {
                     int delta = tl[m_best_solution.m_selectors[i]] - m_unique_colors[i].m_color[comp_index];
                     trial_error += m_unique_colors[i].m_weight * (delta * delta);
                     if (trial_error >= error_to_beat)
                        break;
                  }

                  //CRNLIB_ASSERT(trial_error >= min_possible_error);

                  if (trial_error < error_to_beat)
                  {
                     color_quad_u8 l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
                     color_quad_u8 h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
                     l[comp_index] = static_cast<uint8>(o);
                     h[comp_index] = static_cast<uint8>(p);

                     bool better = evaluate_solution(
                        dxt1_solution_coordinates(dxt1_block::pack_color(l, false), dxt1_block::pack_color(h, false)),
                        true, &m_best_solution);
                     better;

                     if (better)
                     {
#if 0
                        printf("comp: %u, orig: %u %u, new: %u %u, orig_error: %u, new_error: %u\n", comp_index,
                           orig_l[comp_index], orig_h[comp_index],
                           l[comp_index], h[comp_index],
                           orig_error, m_best_solution.m_error);
#endif
                        if (!m_best_solution.m_error)
                           return;

                        error_to_beat = 0;
                        for (uint i = 0; i < m_unique_colors.size(); i++)
                        {
                           int delta = tl[m_best_solution.m_selectors[i]] - m_unique_colors[i].m_color[comp_index];
                           error_to_beat += m_unique_colors[i].m_weight * (delta * delta);
                        }

                     } // better

                     //goto early_out;
                  } // if (trial_error < error_to_beat)

               } // for (uint p = 0; p <= m; p++)
            }

         } // for (uint o = 0; o <= m; o++)

      } // comp_index
   }

   // Voxel adjacency delta coordinations.
   static const struct adjacent_coords
   {
      int8 x, y, z;
   } g_adjacency[26] = {
      {-1, -1, -1},
      {0, -1, -1},
      {1, -1, -1},
      {-1, 0, -1},
      {0, 0, -1},
      {1, 0, -1},
      {-1, 1, -1},
      {0, 1, -1},

      {1, 1, -1},
      {-1, -1, 0},
      {0, -1, 0},
      {1, -1, 0},
      {-1, 0, 0},
      {1, 0, 0},
      {-1, 1, 0},
      {0, 1, 0},

      {1, 1, 0},
      {-1, -1, 1},
      {0, -1, 1},
      {1, -1, 1},
      {-1, 0, 1},
      {0, 0, 1},
      {1, 0, 1},
      {-1, 1, 1},

      {0, 1, 1},
      {1, 1, 1}
   };

   // Attempt to refine current solution's endpoints given the current selectors using least squares.
   bool dxt1_endpoint_optimizer::refine_solution(int refinement_level)
   {
      CRNLIB_ASSERT(m_best_solution.m_valid);

      static const int w1Tab[4] = { 3,0,2,1 };

      static const int prods_0[4] = { 0x00,0x00,0x02,0x02 };
      static const int prods_1[4] = { 0x00,0x09,0x01,0x04 };
      static const int prods_2[4] = { 0x09,0x00,0x04,0x01 };

      double akku_0 = 0;
      double akku_1 = 0;
      double akku_2 = 0;
      double At1_r, At1_g, At1_b;
      double At2_r, At2_g, At2_b;

      At1_r = At1_g = At1_b = 0;
      At2_r = At2_g = At2_b = 0;
      for(uint i = 0; i < m_unique_colors.size(); i++)
      {
         const color_quad_u8& c = m_unique_colors[i].m_color;
         const double weight = m_unique_colors[i].m_weight;

         double r = c.r*weight;
         double g = c.g*weight;
         double b = c.b*weight;
         int step = m_best_solution.m_selectors[i]^1;

         int w1 = w1Tab[step];

         akku_0  += prods_0[step]*weight;
         akku_1  += prods_1[step]*weight;
         akku_2  += prods_2[step]*weight;
         At1_r   += w1*r;
         At1_g   += w1*g;
         At1_b   += w1*b;
         At2_r   += r;
         At2_g   += g;
         At2_b   += b;
      }

      At2_r = 3*At2_r - At1_r;
      At2_g = 3*At2_g - At1_g;
      At2_b = 3*At2_b - At1_b;

      double xx = akku_2;
      double yy = akku_1;
      double xy = akku_0;

      double t = xx * yy - xy * xy;
      if (!yy || !xx || (fabs(t) < .0000125f))
         return false;

      double frb = (3.0f * 31.0f / 255.0f) / t;
      double fg = frb * (63.0f / 31.0f);

      bool improved = false;

      if (refinement_level == 0)
      {
         uint max16;
         max16 =   math::clamp<int>(static_cast<int>((At1_r*yy - At2_r*xy)*frb+0.5f),0,31) << 11;
         max16 |=  math::clamp<int>(static_cast<int>((At1_g*yy - At2_g*xy)*fg +0.5f),0,63) << 5;
         max16 |=  math::clamp<int>(static_cast<int>((At1_b*yy - At2_b*xy)*frb+0.5f),0,31) << 0;

         uint min16;
         min16 =   math::clamp<int>(static_cast<int>((At2_r*xx - At1_r*xy)*frb+0.5f),0,31) << 11;
         min16 |=  math::clamp<int>(static_cast<int>((At2_g*xx - At1_g*xy)*fg +0.5f),0,63) << 5;
         min16 |=  math::clamp<int>(static_cast<int>((At2_b*xx - At1_b*xy)*frb+0.5f),0,31) << 0;

         dxt1_solution_coordinates nc((uint16)min16, (uint16)max16);
         nc.canonicalize();
         improved |= evaluate_solution(nc, true, &m_best_solution, false);
      }
      else if (refinement_level == 1)
      {
         // Try exploring the local lattice neighbors of the least squares optimized result.
         color_quad_u8 e[2];

         e[0].clear();
         e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r*yy - At2_r*xy)*frb+0.5f),0,31);
         e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g*yy - At2_g*xy)*fg +0.5f),0,63);
         e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b*yy - At2_b*xy)*frb+0.5f),0,31);

         e[1].clear();
         e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r*xx - At1_r*xy)*frb+0.5f),0,31);
         e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g*xx - At1_g*xy)*fg +0.5f),0,63);
         e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b*xx - At1_b*xy)*frb+0.5f),0,31);

         for (uint i = 0; i < 2; i++)
         {
            for (int rr = -1; rr <= 1; rr++)
            {
               for (int gr = -1; gr <= 1; gr++)
               {
                  for (int br = -1; br <= 1; br++)
                  {
                     dxt1_solution_coordinates nc;

                     color_quad_u8 c[2];
                     c[0] = e[0];
                     c[1] = e[1];

                     c[i][0] = (uint8)math::clamp<int>(c[i][0] + rr, 0, 31);
                     c[i][1] = (uint8)math::clamp<int>(c[i][1] + gr, 0, 63);
                     c[i][2] = (uint8)math::clamp<int>(c[i][2] + br, 0, 31);

                     nc.m_low_color = dxt1_block::pack_color(c[0], false);
                     nc.m_high_color = dxt1_block::pack_color(c[1], false);

                     nc.canonicalize();

                     if ((nc.m_low_color != m_best_solution.m_coords.m_low_color) || (nc.m_high_color != m_best_solution.m_coords.m_high_color))
                     {
                        improved |= evaluate_solution(nc, true, &m_best_solution, false);
                     }
                  }
               }
            }
         }
      }
      else
      {
         // Try even harder to explore the local lattice neighbors of the least squares optimized result.
         color_quad_u8 e[2];
         e[0].clear();
         e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r*yy - At2_r*xy)*frb+0.5f),0,31);
         e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g*yy - At2_g*xy)*fg +0.5f),0,63);
         e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b*yy - At2_b*xy)*frb+0.5f),0,31);

         e[1].clear();
         e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r*xx - At1_r*xy)*frb+0.5f),0,31);
         e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g*xx - At1_g*xy)*fg +0.5f),0,63);
         e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b*xx - At1_b*xy)*frb+0.5f),0,31);

         for (int orr = -1; orr <= 1; orr++)
         {
            for (int ogr = -1; ogr <= 1; ogr++)
            {
               for (int obr = -1; obr <= 1; obr++)
               {
                  dxt1_solution_coordinates nc;

                  color_quad_u8 c[2];
                  c[0] = e[0];
                  c[1] = e[1];

                  c[0][0] = (uint8)math::clamp<int>(c[0][0] + orr, 0, 31);
                  c[0][1] = (uint8)math::clamp<int>(c[0][1] + ogr, 0, 63);
                  c[0][2] = (uint8)math::clamp<int>(c[0][2] + obr, 0, 31);

                  for (int rr = -1; rr <= 1; rr++)
                  {
                     for (int gr = -1; gr <= 1; gr++)
                     {
                        for (int br = -1; br <= 1; br++)
                        {
                           c[1][0] = (uint8)math::clamp<int>(c[1][0] + rr, 0, 31);
                           c[1][1] = (uint8)math::clamp<int>(c[1][1] + gr, 0, 63);
                           c[1][2] = (uint8)math::clamp<int>(c[1][2] + br, 0, 31);

                           nc.m_low_color = dxt1_block::pack_color(c[0], false);
                           nc.m_high_color = dxt1_block::pack_color(c[1], false);
                           nc.canonicalize();

                           improved |= evaluate_solution(nc, true, &m_best_solution, false);
                        }
                     }
                  }
               }
            }
         }
      }

      return improved;
   }

   //-----------------------------------------------------------------------------------------------------------------------------------------

   // Primary endpoint optimization entrypoint.
   bool dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_color)
   {
      vec3F orig_low_color(low_color);
      vec3F orig_high_color(high_color);

      m_trial_solution.clear();

      uint num_passes;
      const int16* pProbe_table = g_uber_probe_table;
      uint probe_range;
      float dist_per_trial = .015625f;

      // How many probes, and the distance between each probe depends on the quality level.
      switch (m_pParams->m_quality)
      {
         case cCRNDXTQualitySuperFast:
            pProbe_table = g_fast_probe_table;
            probe_range = cFastProbeTableSize;
            dist_per_trial = .027063293f;
            num_passes = 1;
            break;
         case cCRNDXTQualityFast:
            pProbe_table = g_fast_probe_table;
            probe_range = cFastProbeTableSize;
            dist_per_trial = .027063293f;
            num_passes = 2;
            break;
         case cCRNDXTQualityNormal:
            pProbe_table = g_normal_probe_table;
            probe_range = cNormalProbeTableSize;
            dist_per_trial = .027063293f;
            num_passes = 2;
            break;
         case cCRNDXTQualityBetter:
            pProbe_table = g_better_probe_table;
            probe_range = cBetterProbeTableSize;
            num_passes = 2;
            break;
         default:
            pProbe_table = g_uber_probe_table;
            probe_range = cUberProbeTableSize;
            num_passes = 4;
            break;
      }

      m_solutions_tried.reset();

      if (m_pParams->m_endpoint_caching)
      {
         // Try the previous X winning endpoints. This may not give us optimal results, but it may increase the probability of early outs while evaluating potential solutions.
         const uint num_prev_results = math::minimum<uint>(cMaxPrevResults, m_num_prev_results);
         for (uint i = 0; i < num_prev_results; i++)
         {
            const dxt1_solution_coordinates& coords = m_prev_results[i];

            solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
            if (!solution_res.second)
               continue;

            evaluate_solution(coords, true, &m_best_solution);
         }

         if (!m_best_solution.m_error)
         {
            // Got lucky - one of the previous endpoints is optimal.
            return_solution(*m_pResults, m_best_solution);
            return true;
         }
      }

      if (m_pParams->m_quality >= cCRNDXTQualityBetter)
      {
         //evaluate_solution(dxt1_solution_coordinates(low_color, high_color), true, &m_best_solution);
         //refine_solution();

         try_median4(orig_low_color, orig_high_color);
      }

      uint probe_low[cUberProbeTableSize * 2 + 1];
      uint probe_high[cUberProbeTableSize * 2 + 1];

      vec3F scaled_principle_axis[2];

      scaled_principle_axis[1] = m_principle_axis * dist_per_trial;
      scaled_principle_axis[1][0] *= 31.0f;
      scaled_principle_axis[1][1] *= 63.0f;
      scaled_principle_axis[1][2] *= 31.0f;

      scaled_principle_axis[0] = -scaled_principle_axis[1];

      //vec3F initial_ofs(scaled_principle_axis * (float)-probe_range);
      //initial_ofs[0] += .5f;
      //initial_ofs[1] += .5f;
      //initial_ofs[2] += .5f;

      low_color[0] = math::clamp(low_color[0] * 31.0f, 0.0f, 31.0f);
      low_color[1] = math::clamp(low_color[1] * 63.0f, 0.0f, 63.0f);
      low_color[2] = math::clamp(low_color[2] * 31.0f, 0.0f, 31.0f);

      high_color[0] = math::clamp(high_color[0] * 31.0f, 0.0f, 31.0f);
      high_color[1] = math::clamp(high_color[1] * 63.0f, 0.0f, 63.0f);
      high_color[2] = math::clamp(high_color[2] * 31.0f, 0.0f, 31.0f);

      for (uint pass = 0; pass < num_passes; pass++)
      {
         // Now separately sweep or probe the low and high colors along the principle axis, both positively and negatively.
         // This results in two arrays of candidate low/high endpoints. Every unique combination of candidate endpoints is tried as a potential solution.
         // In higher quality modes, the various nearby lattice neighbors of each candidate endpoint are also explored, which allows the current solution to "wobble" or "migrate"
         // to areas with lower error.
         // This entire process can be repeated up to X times (depending on the quality level) until a local minimum is established.
         // This method is very stable and scalable. It could be implemented more elegantly, but I'm now very cautious of touching this code.
         if (pass)
         {
            low_color = unpack_to_vec3F_raw(m_best_solution.m_coords.m_low_color);
            high_color = unpack_to_vec3F_raw(m_best_solution.m_coords.m_high_color);
         }

         const uint64 prev_best_error = m_best_solution.m_error;
         if (!prev_best_error)
            break;

         // Sweep low endpoint along principle axis, record positions
         int prev_packed_color[2] = { -1, -1 };
         uint num_low_trials = 0;
         vec3F initial_probe_low_color(low_color + vec3F(.5f));
         for (uint i = 0; i < probe_range; i++)
         {
            const int ls = i ? 0 : 1;
            int x = pProbe_table[i];

            for (int s = ls; s < 2; s++)
            {
               vec3F probe_low_color(initial_probe_low_color + scaled_principle_axis[s] * (float)x);

               int r = math::clamp((int)floor(probe_low_color[0]), 0, 31);
               int g = math::clamp((int)floor(probe_low_color[1]), 0, 63);
               int b = math::clamp((int)floor(probe_low_color[2]), 0, 31);

               int packed_color = b | (g << 5U) | (r << 11U);
               if (packed_color != prev_packed_color[s])
               {
                  probe_low[num_low_trials++] = packed_color;
                  prev_packed_color[s] = packed_color;
               }
            }
         }

         prev_packed_color[0] = -1;
         prev_packed_color[1] = -1;

         // Sweep high endpoint along principle axis, record positions
         uint num_high_trials = 0;
         vec3F initial_probe_high_color(high_color + vec3F(.5f));
         for (uint i = 0; i < probe_range; i++)
         {
            const int ls = i ? 0 : 1;
            int x = pProbe_table[i];

            for (int s = ls; s < 2; s++)
            {
               vec3F probe_high_color(initial_probe_high_color + scaled_principle_axis[s] * (float)x);

               int r = math::clamp((int)floor(probe_high_color[0]), 0, 31);
               int g = math::clamp((int)floor(probe_high_color[1]), 0, 63);
               int b = math::clamp((int)floor(probe_high_color[2]), 0, 31);

               int packed_color = b | (g << 5U) | (r << 11U);
               if (packed_color != prev_packed_color[s])
               {
                  probe_high[num_high_trials++] = packed_color;
                  prev_packed_color[s] = packed_color;
               }
            }
         }

         // Now try all unique combinations.
         for (uint i = 0; i < num_low_trials; i++)
         {
            for (uint j = 0; j < num_high_trials; j++)
            {
               dxt1_solution_coordinates coords((uint16)probe_low[i], (uint16)probe_high[j]);
               coords.canonicalize();

               solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
               if (!solution_res.second)
                  continue;

               evaluate_solution(coords, true, &m_best_solution);
            }
         }

         if (m_pParams->m_quality >= cCRNDXTQualityNormal)
         {
            // Generate new candidates by exploring the low color's direct lattice neighbors
            color_quad_u8 lc(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));

            for (int i = 0; i < 26; i++)
            {
               int r = lc.r + g_adjacency[i].x;
               if ((r < 0) || (r > 31)) continue;

               int g = lc.g + g_adjacency[i].y;
               if ((g < 0) || (g > 63)) continue;

               int b = lc.b + g_adjacency[i].z;
               if ((b < 0) || (b > 31)) continue;

               dxt1_solution_coordinates coords(dxt1_block::pack_color(r, g, b, false), m_best_solution.m_coords.m_high_color);
               coords.canonicalize();

               solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
               if (solution_res.second)
                  evaluate_solution(coords, true, &m_best_solution);
            }

            if (m_pParams->m_quality == cCRNDXTQualityUber)
            {
               // Generate new candidates by exploring the low color's direct lattice neighbors - this time, explore much further separately on each axis.
               lc = dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false);

               for (int a = 0; a < 3; a++)
               {
                  int limit = (a == 1) ? 63 : 31;

                  for (int s = -2; s <= 2; s += 4)
                  {
                     color_quad_u8 c(lc);
                     int q = c[a] + s;
                     if ((q < 0) || (q > limit)) continue;

                     c[a] = (uint8)q;

                     dxt1_solution_coordinates coords(dxt1_block::pack_color(c, false), m_best_solution.m_coords.m_high_color);
                     coords.canonicalize();

                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
                     if (solution_res.second)
                        evaluate_solution(coords, true, &m_best_solution);
                  }
               }
            }

            // Generate new candidates by exploring the high color's direct lattice neighbors
            color_quad_u8 hc(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));

            for (int i = 0; i < 26; i++)
            {
               int r = hc.r + g_adjacency[i].x;
               if ((r < 0) || (r > 31)) continue;

               int g = hc.g + g_adjacency[i].y;
               if ((g < 0) || (g > 63)) continue;

               int b = hc.b + g_adjacency[i].z;
               if ((b < 0) || (b > 31)) continue;

               dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(r, g, b, false));
               coords.canonicalize();

               solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
               if (solution_res.second)
                  evaluate_solution(coords, true, &m_best_solution);
            }

            if (m_pParams->m_quality == cCRNDXTQualityUber)
            {
               // Generate new candidates by exploring the high color's direct lattice neighbors - this time, explore much further separately on each axis.
               hc = dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false);

               for (int a = 0; a < 3; a++)
               {
                  int limit = (a == 1) ? 63 : 31;

                  for (int s = -2; s <= 2; s += 4)
                  {
                     color_quad_u8 c(hc);
                     int q = c[a] + s;
                     if ((q < 0) || (q > limit)) continue;

                     c[a] = (uint8)q;

                     dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(c, false));
                     coords.canonicalize();

                     solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
                     if (solution_res.second)
                        evaluate_solution(coords, true, &m_best_solution);
                  }
               }
            }
         }

         if ((!m_best_solution.m_error) || ((pass) && (m_best_solution.m_error == prev_best_error)))
            break;

         if (m_pParams->m_quality >= cCRNDXTQualityUber)
         {
            // Attempt to refine current solution's endpoints given the current selectors using least squares.
            refine_solution(1);
         }
      }

      if (m_pParams->m_quality >= cCRNDXTQualityNormal)
      {
         if ((m_best_solution.m_error) && (!m_pParams->m_pixels_have_alpha))
         {
            bool choose_solid_block = false;
            if (m_best_solution.are_selectors_all_equal())
            {
               // All selectors equal - try various solid-block optimizations
               choose_solid_block = try_average_block_as_solid();
            }

            if ((!choose_solid_block) && (m_pParams->m_quality == cCRNDXTQualityUber))
            {
               // Per-component 1D endpoint optimization.
               optimize_endpoint_comps();
            }
         }

         if (m_pParams->m_quality == cCRNDXTQualityUber)
         {
            if (m_best_solution.m_error)
            {
               // The pixels may have already been DXTc compressed by another compressor.
               // It's usually possible to recover the endpoints used to previously pack the block.
               try_combinatorial_encoding();
            }
         }
      }

      return_solution(*m_pResults, m_best_solution);

      if (m_pParams->m_endpoint_caching)
      {
         // Remember result for later reruse.
         m_prev_results[m_num_prev_results & (cMaxPrevResults - 1)] = m_best_solution.m_coords;
         m_num_prev_results++;
      }

      return true;
   }

   static inline int mul_8bit(int a, int b)
   {
      int t = a * b + 128;
      return (t + (t >> 8)) >> 8;
   }

   bool dxt1_endpoint_optimizer::handle_multicolor_block()
   {
      uint num_passes = 1;
      vec3F perceptual_weights(1.0f);

      if (m_perceptual)
      {
         // Compute RGB weighting for use in perceptual mode.
         // The more saturated the block, the more the weights deviate from (1,1,1).
         float ave_redness = 0;
         float ave_blueness = 0;
         float ave_l = 0;

         for (uint i = 0; i < m_unique_colors.size(); i++)
         {
            const color_quad_u8& c = m_unique_colors[i].m_color;
            const float weight = (float)m_unique_colors[i].m_weight;

            int l = mul_8bit(c.r + c.g + c.b, 0x55); // /3
            ave_l += l;
            l = math::maximum(1, l);

            float scale = weight / static_cast<float>(l);

            ave_redness += scale * c.r;
            ave_blueness += scale * c.b;
         }

         ave_redness /= m_total_unique_color_weight;
         ave_blueness /= m_total_unique_color_weight;
         ave_l /= m_total_unique_color_weight;

         ave_l = math::minimum(1.0f, ave_l * 16.0f / 255.0f);

         //float r = ave_l * powf(math::saturate(ave_redness / 3.0f), 5.0f);
         //float b = ave_l * powf(math::saturate(ave_blueness / 3.0f), 5.0f);

         float p = ave_l * powf(math::saturate(math::maximum(ave_redness, ave_blueness) * 1.0f/3.0f), 2.75f);

         if (p >= 1.0f)
            num_passes = 1;
         else
         {
            num_passes = 2;
            perceptual_weights = vec3F::lerp(vec3F(.212f, .72f, .072f), perceptual_weights, p);
         }
      }

      for (uint pass_index = 0; pass_index < num_passes; pass_index++)
      {
         compute_vectors(perceptual_weights);

         compute_pca(m_principle_axis, m_norm_unique_colors_weighted, vec3F(.2837149f, 0.9540631f, 0.096277453f));

#if 0
         matrix44F m(matrix44F::make_scale_matrix(perceptual_weights[0], perceptual_weights[1], perceptual_weights[2]));
         matrix44F im(m.get_inverse());
         im.transpose_in_place();
         m_principle_axis = m_principle_axis * im;
#else
         // Purposely scale the components of the principle axis by the perceptual weighting.
         // There's probably a cleaner way to go about this, but it works (more competitive in perceptual mode against nvdxt.exe or ATI_Compress).
         m_principle_axis[0] /= perceptual_weights[0];
         m_principle_axis[1] /= perceptual_weights[1];
         m_principle_axis[2] /= perceptual_weights[2];
#endif
         m_principle_axis.normalize_in_place();

         if (num_passes > 1)
         {
            // Check for obviously wild principle axes and try to compensate by backing off the component weightings.
            if (fabs(m_principle_axis[0]) >= .795f)
               perceptual_weights.set(.424f, .6f, .072f);
            else if (fabs(m_principle_axis[2]) >= .795f)
               perceptual_weights.set(.212f, .6f, .212f);
            else
               break;
         }
      }

      // Find bounds of projection onto (potentially skewed) principle axis.
      float l = 1e+9;
      float h = -1e+9;

      for (uint i = 0; i < m_norm_unique_colors.size(); i++)
      {
         float d = m_norm_unique_colors[i] * m_principle_axis;
         l = math::minimum(l, d);
         h = math::maximum(h, d);
      }

      vec3F low_color(m_mean_norm_color + l * m_principle_axis);
      vec3F high_color(m_mean_norm_color + h * m_principle_axis);

      if (!low_color.is_within_bounds(0.0f, 1.0f))
      {
         // Low color is outside the lattice, so bring it back in by casting a ray.
         vec3F coord;
         float t;
         aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
         intersection::result res = intersection::ray_aabb(coord, t, ray3F(low_color, m_principle_axis), bounds);
         if (res == intersection::cSuccess)
            low_color = coord;
      }

      if (!high_color.is_within_bounds(0.0f, 1.0f))
      {
         // High color is outside the lattice, so bring it back in by casting a ray.
         vec3F coord;
         float t;
         aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
         intersection::result res = intersection::ray_aabb(coord, t, ray3F(high_color, -m_principle_axis), bounds);
         if (res == intersection::cSuccess)
            high_color = coord;
      }

      // Now optimize the endpoints using the projection bounds on the (potentially skewed) principle axis as a starting point.
      if (!optimize_endpoints(low_color, high_color))
         return false;

      return true;
   }

   bool dxt1_endpoint_optimizer::handle_grayscale_block()
   {
      // TODO
      return true;
   }

   // Tries quantizing the block to 4 colors using vanilla LBG. It tries all combinations of the quantized results as potential endpoints.
   bool dxt1_endpoint_optimizer::try_median4(const vec3F& low_color, const vec3F& high_color)
   {
      vec3F means[4];

      if (m_unique_colors.size() <= 4)
      {
         for (uint i = 0; i < 4; i++)
            means[i] = m_norm_unique_colors[math::minimum<int>(m_norm_unique_colors.size() - 1, i)];
      }
      else
      {
         means[0] = low_color - m_mean_norm_color;
         means[3] = high_color - m_mean_norm_color;
         means[1] = vec3F::lerp(means[0], means[3], 1.0f/3.0f);
         means[2] = vec3F::lerp(means[0], means[3], 2.0f/3.0f);

         fast_random rm;

         const uint cMaxIters = 8;
         uint reassign_rover = 0;
         float prev_total_dist = math::cNearlyInfinite;
         for (uint iter = 0; iter < cMaxIters; iter++)
         {
            vec3F new_means[4];
            float new_weights[4];
            utils::zero_object(new_means);
            utils::zero_object(new_weights);

            float total_dist = 0;

            for (uint i = 0; i < m_unique_colors.size(); i++)
            {
               const vec3F& v = m_norm_unique_colors[i];

               float best_dist = means[0].squared_distance(v);
               int best_index = 0;

               for (uint j = 1; j < 4; j++)
               {
                  float dist = means[j].squared_distance(v);
                  if (dist < best_dist)
                  {
                     best_dist = dist;
                     best_index = j;
                  }
               }

               total_dist += best_dist;

               new_means[best_index] += v * (float)m_unique_colors[i].m_weight;
               new_weights[best_index] += (float)m_unique_colors[i].m_weight;
            }

            uint highest_index = 0;
            float highest_weight = 0;
            bool empty_cell = false;
            for (uint j = 0; j < 4; j++)
            {
               if (new_weights[j] > 0.0f)
               {
                  means[j] = new_means[j] / new_weights[j];
                  if (new_weights[j] > highest_weight)
                  {
                     highest_weight = new_weights[j];
                     highest_index = j;
                  }
               }
               else
                  empty_cell = true;
            }

            if (!empty_cell)
            {
               if (fabs(total_dist - prev_total_dist) < .00001f)
                  break;

               prev_total_dist = total_dist;
            }
            else
               prev_total_dist = math::cNearlyInfinite;

            if ((empty_cell) && (iter != (cMaxIters - 1)))
            {
               const uint ri = (highest_index + reassign_rover) & 3;
               reassign_rover++;

               for (uint j = 0; j < 4; j++)
               {
                  if (new_weights[j] == 0.0f)
                  {
                     means[j] = means[ri];
                     means[j] += vec3F::make_random(rm, -.00196f, .00196f);
                  }
               }
            }
         }
      }

      bool improved = false;

      for (uint i = 0; i < 3; i++)
      {
         for (uint j = i + 1; j < 4; j++)
         {
            const vec3F v0(means[i] + m_mean_norm_color);
            const vec3F v1(means[j] + m_mean_norm_color);

            dxt1_solution_coordinates sc(
               color_quad_u8((int)floor(.5f + v0[0] * 31.0f), (int)floor(.5f + v0[1] * 63.0f), (int)floor(.5f + v0[2] * 31.0f), 255),
               color_quad_u8((int)floor(.5f + v1[0] * 31.0f), (int)floor(.5f + v1[1] * 63.0f), (int)floor(.5f + v1[2] * 31.0f), 255), false );

            sc.canonicalize();

            improved |= evaluate_solution(sc, true, &m_best_solution, false);
         }
      }

      improved |= refine_solution((m_pParams->m_quality == cCRNDXTQualityUber) ? 1 : 0);

      return improved;
   }

   // Given candidate low/high endpoints, find the optimal selectors for 3 and 4 color blocks, compute the resulting error,
   // and use the candidate if it results in less error than the best found result so far.
   bool dxt1_endpoint_optimizer::evaluate_solution(
      const dxt1_solution_coordinates& coords,
      bool early_out,
      potential_solution* pBest_solution,
      bool alternate_rounding)
   {
      m_total_evals++;

      if ((!m_pSolutions) || (alternate_rounding))
      {
         if (m_pParams->m_quality >= cCRNDXTQualityBetter)
            return evaluate_solution_uber(m_trial_solution, coords, early_out, pBest_solution, alternate_rounding);
         else
            return evaluate_solution_fast(m_trial_solution, coords, early_out, pBest_solution, alternate_rounding);
      }

      evaluate_solution_uber(m_trial_solution, coords, false, NULL, alternate_rounding);

      CRNLIB_ASSERT(m_trial_solution.m_valid);

      // Caller has requested all considered candidate solutions for later analysis.
      m_pSolutions->resize(m_pSolutions->size() + 1);
      solution& new_solution = m_pSolutions->back();
      new_solution.m_selectors.resize(m_pParams->m_num_pixels);
      new_solution.m_results.m_pSelectors = &new_solution.m_selectors[0];

      return_solution(new_solution.m_results, m_trial_solution);

      if ((pBest_solution) && (m_trial_solution.m_error < m_best_solution.m_error))
      {
         *pBest_solution = m_trial_solution;
         return true;
      }

      return false;
   }

   inline uint dxt1_endpoint_optimizer::color_distance(bool perceptual, const color_quad_u8& e1, const color_quad_u8& e2, bool alpha)
   {
      if (perceptual)
      {
         return color::color_distance(true, e1, e2, alpha);
      }
      else if (m_pParams->m_grayscale_sampling)
      {
         // Computes error assuming shader will be converting the result to grayscale.
         int y0 = color::RGB_to_Y(e1);
         int y1 = color::RGB_to_Y(e2);
         int yd = y0  - y1;
         if (alpha)
         {
            int da = (int)e1[3] - (int)e2[3];
            return yd * yd + da * da;
         }
         else
         {
            return yd * yd;
         }
      }
      else if (m_has_color_weighting)
      {
         // Compute error using user provided color component weights.
         int dr = (int)e1[0] - (int)e2[0];
         int dg = (int)e1[1] - (int)e2[1];
         int db = (int)e1[2] - (int)e2[2];

         dr = (dr * dr) * m_pParams->m_color_weights[0];
         dg = (dg * dg) * m_pParams->m_color_weights[1];
         db = (db * db) * m_pParams->m_color_weights[2];

         if (alpha)
         {
            int da = (int)e1[3] - (int)e2[3];
            da = (da * da) * (m_pParams->m_color_weights[0] + m_pParams->m_color_weights[1] + m_pParams->m_color_weights[2]);
            return dr + dg + db + da;
         }
         else
         {
            return dr + dg + db;
         }
      }
      else
      {
         return color::color_distance(false, e1, e2, alpha);
      }
   }

   bool dxt1_endpoint_optimizer::evaluate_solution_uber(
      potential_solution& solution,
      const dxt1_solution_coordinates& coords,
      bool early_out,
      potential_solution* pBest_solution,
      bool alternate_rounding)
   {
      solution.m_coords = coords;
      solution.m_selectors.resize(m_unique_colors.size());

      if ((pBest_solution) && (early_out))
         solution.m_error = pBest_solution->m_error;
      else
         solution.m_error = cUINT64_MAX;

      solution.m_alpha_block = false;
      solution.m_valid = false;

      uint first_block_type = 0;
      uint last_block_type = 1;

      if ((m_pParams->m_pixels_have_alpha) || (m_pParams->m_force_alpha_blocks))
         first_block_type = 1;
      else if (!m_pParams->m_use_alpha_blocks)
         last_block_type = 0;

      m_trial_selectors.resize(m_unique_colors.size());

      color_quad_u8 colors[cDXT1SelectorValues];

      colors[0] = dxt1_block::unpack_color(coords.m_low_color, true);
      colors[1] = dxt1_block::unpack_color(coords.m_high_color, true);

      for (uint block_type = first_block_type; block_type <= last_block_type; block_type++)
      {
         uint64 trial_error = 0;

         if (!block_type)
         {
            colors[2].set_noclamp_rgba( (colors[0].r * 2 + colors[1].r + alternate_rounding) / 3, (colors[0].g * 2 + colors[1].g + alternate_rounding) / 3, (colors[0].b * 2 + colors[1].b + alternate_rounding) / 3, 0);
            colors[3].set_noclamp_rgba( (colors[1].r * 2 + colors[0].r + alternate_rounding) / 3, (colors[1].g * 2 + colors[0].g + alternate_rounding) / 3, (colors[1].b * 2 + colors[0].b + alternate_rounding) / 3, 0);

            if (m_perceptual)
            {
               for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
               {
                  const color_quad_u8& c = m_unique_colors[unique_color_index].m_color;

                  uint best_error = color_distance(true, c, colors[0], false);
                  uint best_color_index = 0;

                  uint err = color_distance(true, c, colors[1], false);
                  if (err < best_error) { best_error = err; best_color_index = 1; }

                  err = color_distance(true, c, colors[2], false);
                  if (err < best_error) { best_error = err; best_color_index = 2; }

                  err = color_distance(true, c, colors[3], false);
                  if (err < best_error) { best_error = err; best_color_index = 3; }

                  trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
                  if (trial_error >= solution.m_error)
                     break;

                  m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
               }
            }
            else
            {
               for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
               {
                  const color_quad_u8& c = m_unique_colors[unique_color_index].m_color;

                  uint best_error = color_distance(false, c, colors[0], false);
                  uint best_color_index = 0;

                  uint err = color_distance(false, c, colors[1], false);
                  if (err < best_error) { best_error = err; best_color_index = 1; }

                  err = color_distance(false, c, colors[2], false);
                  if (err < best_error) { best_error = err; best_color_index = 2; }

                  err = color_distance(false, c, colors[3], false);
                  if (err < best_error) { best_error = err; best_color_index = 3; }

                  trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
                  if (trial_error >= solution.m_error)
                     break;

                  m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
               }
            }
         }
         else
         {
            colors[2].set_noclamp_rgba( (colors[0].r + colors[1].r + alternate_rounding) >> 1, (colors[0].g + colors[1].g + alternate_rounding) >> 1, (colors[0].b + colors[1].b + alternate_rounding) >> 1, 255U);

            if (m_perceptual)
            {
               for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
               {
                  const color_quad_u8& c = m_unique_colors[unique_color_index].m_color;

                  uint best_error = color_distance(true, c, colors[0], false);
                  uint best_color_index = 0;

                  uint err = color_distance(true, c, colors[1], false);
                  if (err < best_error) { best_error = err; best_color_index = 1; }

                  err = color_distance(true, c, colors[2], false);
                  if (err < best_error) { best_error = err; best_color_index = 2; }

                  trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
                  if (trial_error >= solution.m_error)
                     break;

                  m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
               }
            }
            else
            {
               for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
               {
                  const color_quad_u8& c = m_unique_colors[unique_color_index].m_color;

                  uint best_error = color_distance(false, c, colors[0], false);
                  uint best_color_index = 0;

                  uint err = color_distance(false, c, colors[1], false);
                  if (err < best_error) { best_error = err; best_color_index = 1; }

                  err = color_distance(false, c, colors[2], false);
                  if (err < best_error) { best_error = err; best_color_index = 2; }

                  trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
                  if (trial_error >= solution.m_error)
                     break;

                  m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
               }
            }
         }

         if (trial_error < solution.m_error)
         {
            solution.m_error = trial_error;
            solution.m_alpha_block = (block_type != 0);
            solution.m_selectors = m_trial_selectors;
            solution.m_valid = true;
         }
      }

      if ((!solution.m_alpha_block) && (solution.m_coords.m_low_color == solution.m_coords.m_high_color))
      {
         uint s;
         if ((solution.m_coords.m_low_color & 31) != 31)
         {
            solution.m_coords.m_low_color++;
            s = 1;
         }
         else
         {
            solution.m_coords.m_high_color--;
            s = 0;
         }

         for (uint i = 0; i < m_unique_colors.size(); i++)
            solution.m_selectors[i] = static_cast<uint8>(s);
      }

      if ((pBest_solution) && (solution.m_error < pBest_solution->m_error))
      {
         *pBest_solution = solution;
         return true;
      }

      return false;
   }

   bool dxt1_endpoint_optimizer::evaluate_solution_fast(
      potential_solution& solution,
      const dxt1_solution_coordinates& coords,
      bool early_out,
      potential_solution* pBest_solution,
      bool alternate_rounding)
   {
      solution.m_coords = coords;
      solution.m_selectors.resize(m_unique_colors.size());

      if ((pBest_solution) && (early_out))
         solution.m_error = pBest_solution->m_error;
      else
         solution.m_error = cUINT64_MAX;

      solution.m_alpha_block = false;
      solution.m_valid = false;

      uint first_block_type = 0;
      uint last_block_type = 1;

      if ((m_pParams->m_pixels_have_alpha) || (m_pParams->m_force_alpha_blocks))
         first_block_type = 1;
      else if (!m_pParams->m_use_alpha_blocks)
         last_block_type = 0;

      m_trial_selectors.resize(m_unique_colors.size());

      color_quad_u8 colors[cDXT1SelectorValues];
      colors[0] = dxt1_block::unpack_color(coords.m_low_color, true);
      colors[1] = dxt1_block::unpack_color(coords.m_high_color, true);

      int vr = colors[1].r - colors[0].r;
      int vg = colors[1].g - colors[0].g;
      int vb = colors[1].b - colors[0].b;
      if (m_perceptual)
      {
         vr *= 8;
         vg *= 24;
      }

      int stops[4];
      stops[0] = colors[0].r*vr + colors[0].g*vg + colors[0].b*vb;
      stops[1] = colors[1].r*vr + colors[1].g*vg + colors[1].b*vb;

      int dirr = vr * 2;
      int dirg = vg * 2;
      int dirb = vb * 2;

      for (uint block_type = first_block_type; block_type <= last_block_type; block_type++)
      {
         uint64 trial_error = 0;

         if (!block_type)
         {
            colors[2].set_noclamp_rgba( (colors[0].r * 2 + colors[1].r + alternate_rounding) / 3, (colors[0].g * 2 + colors[1].g + alternate_rounding) / 3, (colors[0].b * 2 + colors[1].b + alternate_rounding) / 3, 255U);
            colors[3].set_noclamp_rgba( (colors[1].r * 2 + colors[0].r + alternate_rounding) / 3, (colors[1].g * 2 + colors[0].g + alternate_rounding) / 3, (colors[1].b * 2 + colors[0].b + alternate_rounding) / 3, 255U);

            stops[2] = colors[2].r*vr + colors[2].g*vg + colors[2].b*vb;
            stops[3] = colors[3].r*vr + colors[3].g*vg + colors[3].b*vb;

            // 0 2 3 1
            int c0Point = stops[1] + stops[3];
            int halfPoint = stops[3] + stops[2];
            int c3Point = stops[2] + stops[0];

            for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
            {
               const color_quad_u8& c = m_unique_colors[unique_color_index].m_color;

               int dot = c.r*dirr + c.g*dirg + c.b*dirb;

               uint8 best_color_index;
               if (dot < halfPoint)
                  best_color_index = (dot < c3Point) ? 0 : 2;
               else
                  best_color_index = (dot < c0Point) ? 3 : 1;

               uint best_error = color_distance(m_perceptual, c, colors[best_color_index], false);

               trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
               if (trial_error >= solution.m_error)
                  break;

               m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
            }
         }
         else
         {
            colors[2].set_noclamp_rgba( (colors[0].r + colors[1].r + alternate_rounding) >> 1, (colors[0].g + colors[1].g + alternate_rounding) >> 1, (colors[0].b + colors[1].b + alternate_rounding) >> 1, 255U);

            stops[2] = colors[2].r*vr + colors[2].g*vg + colors[2].b*vb;

            // 0 2 1
            int c02Point = stops[0] + stops[2];
            int c21Point = stops[2] + stops[1];

            for (int unique_color_index = (int)m_unique_colors.size() - 1; unique_color_index >= 0; unique_color_index--)
            {
               const color_quad_u8& c = m_unique_colors[unique_color_index].m_color;

               int dot = c.r*dirr + c.g*dirg + c.b*dirb;

               uint8 best_color_index;
               if (dot < c02Point)
                  best_color_index = 0;
               else if (dot < c21Point)
                  best_color_index = 2;
               else
                  best_color_index = 1;

               uint best_error = color_distance(m_perceptual, c, colors[best_color_index], false);

               trial_error += best_error * m_unique_colors[unique_color_index].m_weight;
               if (trial_error >= solution.m_error)
                  break;

               m_trial_selectors[unique_color_index] = static_cast<uint8>(best_color_index);
            }
         }

         if (trial_error < solution.m_error)
         {
            solution.m_error = trial_error;
            solution.m_alpha_block = (block_type != 0);
            solution.m_selectors = m_trial_selectors;
            solution.m_valid = true;
         }
      }

      if ((!solution.m_alpha_block) && (solution.m_coords.m_low_color == solution.m_coords.m_high_color))
      {
         uint s;
         if ((solution.m_coords.m_low_color & 31) != 31)
         {
            solution.m_coords.m_low_color++;
            s = 1;
         }
         else
         {
            solution.m_coords.m_high_color--;
            s = 0;
         }

         for (uint i = 0; i < m_unique_colors.size(); i++)
            solution.m_selectors[i] = static_cast<uint8>(s);
      }

      if ((pBest_solution) && (solution.m_error < pBest_solution->m_error))
      {
         *pBest_solution = solution;
         return true;
      }

      return false;
   }

   unique_color dxt1_endpoint_optimizer::lerp_color(const color_quad_u8& a, const color_quad_u8& b, float f, int rounding)
   {
      color_quad_u8 res;

      float r = rounding ? 1.0f : 0.0f;
      res[0] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[0], b[0], f)), 0, 255));
      res[1] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[1], b[1], f)), 0, 255));
      res[2] = static_cast<uint8>(math::clamp(math::float_to_int(r + math::lerp<float>(a[2], b[2], f)), 0, 255));
      res[3] = 255;

      return unique_color(res, 1);
   }

   // The block may have been already compressed using another DXTc compressor, such as squish, ATI_Compress, ryg_dxt, etc.
   // Attempt to recover the endpoints used by that block compressor.
   void dxt1_endpoint_optimizer::try_combinatorial_encoding()
   {
      if ((m_unique_colors.size() < 2) || (m_unique_colors.size() > 4))
         return;

      m_temp_unique_colors = m_unique_colors;

      if (m_temp_unique_colors.size() == 2)
      {
         // a    b    c   d
         // 0.0  1/3 2/3  1.0

         for (uint k = 0; k < 2; k++)
         {
            for (uint q = 0; q < 2; q++)
            {
               const uint r = q ^ 1;

               // a b
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 2.0f, k));
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 3.0f, k));

               // a c
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, .5f, k));
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 1.5f, k));

               // a d

               // b c
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -1.0f, k));
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, 2.0f, k));

               // b d
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -.5f, k));
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, .5f, k));

               // c d
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -2.0f, k));
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[q].m_color, m_temp_unique_colors[r].m_color, -1.0f, k));
            }
         }
      }
      else if (m_temp_unique_colors.size() == 3)
      {
         // a    b    c   d
         // 0.0  1/3 2/3  1.0

         for (uint i = 0; i <= 2; i++)
         {
            for (uint j = 0; j <= 2; j++)
            {
               if (i == j)
                  continue;

               // a b c
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 1.5f));

               // a b d
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 2.0f/3.0f));

               // a c d
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, 1.0f/3.0f));

               // b c d
               m_temp_unique_colors.push_back(lerp_color(m_temp_unique_colors[i].m_color, m_temp_unique_colors[j].m_color, -.5f));
            }
         }
      }

      m_unique_packed_colors.resize(0);

      for (uint i = 0; i < m_temp_unique_colors.size(); i++)
      {
         const color_quad_u8& unique_color = m_temp_unique_colors[i].m_color;
         const uint16 packed_color = dxt1_block::pack_color(unique_color, true);

         if (std::find(m_unique_packed_colors.begin(), m_unique_packed_colors.end(), packed_color) != m_unique_packed_colors.end())
            continue;

         m_unique_packed_colors.push_back(packed_color);
      }

      if (m_unique_packed_colors.size() < 2)
         return;

      for (uint alt_rounding = 0; alt_rounding < 2; alt_rounding++)
      {
         for (uint i = 0; i < m_unique_packed_colors.size() - 1; i++)
         {
            for (uint j = i + 1; j < m_unique_packed_colors.size(); j++)
            {
               evaluate_solution(
                  dxt1_solution_coordinates(m_unique_packed_colors[i], m_unique_packed_colors[j]),
                  true,
                  (alt_rounding == 0) ? &m_best_solution : NULL,
                  (alt_rounding != 0));

               if (m_trial_solution.m_error == 0)
               {
                  if (alt_rounding)
                     m_best_solution = m_trial_solution;

                  return;
               }
            }
         }
      }

      return;
   }

   // The fourth (transparent) color in 3 color "transparent" blocks is black, which can be optionally exploited for small gains in DXT1 mode if the caller
   // doesn't actually use alpha. (But not in DXT5 mode, because 3-color blocks aren't permitted by GPU's for DXT5.)
   bool dxt1_endpoint_optimizer::try_alpha_as_black_optimization()
   {
      const params*  pOrig_params = m_pParams;
      pOrig_params;
      results*       pOrig_results = m_pResults;

      uint num_dark_colors = 0;

      for (uint i = 0; i < m_unique_colors.size(); i++)
         if ( (m_unique_colors[i].m_color[0] <= 4) && (m_unique_colors[i].m_color[1] <= 4) && (m_unique_colors[i].m_color[2] <= 4) )
            num_dark_colors++;

      if ( (!num_dark_colors) || (num_dark_colors == m_unique_colors.size()) )
         return true;

      params trial_params(*m_pParams);
      crnlib::vector<color_quad_u8> trial_colors;
      trial_colors.insert(0, m_pParams->m_pPixels, m_pParams->m_num_pixels);

      trial_params.m_pPixels = trial_colors.get_ptr();
      trial_params.m_pixels_have_alpha = true;

      for (uint i = 0; i < trial_colors.size(); i++)
         if ( (trial_colors[i][0] <= 4) && (trial_colors[i][1] <= 4) && (trial_colors[i][2] <= 4) )
            trial_colors[i][3] = 0;

      results trial_results;

      crnlib::vector<uint8> trial_selectors(m_pParams->m_num_pixels);
      trial_results.m_pSelectors = trial_selectors.get_ptr();

      if (!compute_internal(trial_params, trial_results, NULL))
         return false;

      CRNLIB_ASSERT(trial_results.m_alpha_block);

      color_quad_u8 c[4];
      dxt1_block::get_block_colors3(c, trial_results.m_low_color, trial_results.m_high_color);

      uint64 trial_error = 0;

      for (uint i = 0; i < trial_colors.size(); i++)
      {
         if (trial_colors[i][3] == 0)
         {
            CRNLIB_ASSERT(trial_selectors[i] == 3);
         }
         else
         {
            CRNLIB_ASSERT(trial_selectors[i] != 3);
         }

         trial_error += color_distance(m_perceptual, trial_colors[i], c[trial_selectors[i]], false);
      }

      if (trial_error < pOrig_results->m_error)
      {
         pOrig_results->m_error = trial_error;

         pOrig_results->m_low_color = trial_results.m_low_color;
         pOrig_results->m_high_color = trial_results.m_high_color;

         if (pOrig_results->m_pSelectors)
            memcpy(pOrig_results->m_pSelectors, trial_results.m_pSelectors, m_pParams->m_num_pixels);

         pOrig_results->m_alpha_block = true;
      }

      return true;
   }

   bool dxt1_endpoint_optimizer::compute_internal(const params& p, results& r, solution_vec* pSolutions)
   {
      clear();

      m_pParams = &p;
      m_pResults = &r;
      m_pSolutions = pSolutions;

      m_has_color_weighting = (m_pParams->m_color_weights[0] != 1) || (m_pParams->m_color_weights[1] != 1) || (m_pParams->m_color_weights[2] != 1);
      m_perceptual = m_pParams->m_perceptual && !m_has_color_weighting && !m_pParams->m_grayscale_sampling;

      find_unique_colors();

      m_best_solution.clear();

      if (m_unique_colors.empty())
         return handle_all_transparent_block();
      else if ((m_unique_colors.size() == 1) && (!m_has_transparent_pixels))
         return handle_solid_block();
      else
      {
         if (!handle_multicolor_block())
            return false;

         if ((m_all_pixels_grayscale) && (m_best_solution.m_error))
         {
            if (!handle_grayscale_block())
               return false;
         }
      }

      return true;
   }

   bool dxt1_endpoint_optimizer::compute(const params& p, results& r, solution_vec* pSolutions)
   {
      if (!p.m_pPixels)
         return false;

      bool status = compute_internal(p, r, pSolutions);
      if (!status)
         return false;

      if ( (m_pParams->m_use_alpha_blocks) && (m_pParams->m_use_transparent_indices_for_black) && (!m_pParams->m_pixels_have_alpha) && (!pSolutions) )
      {
         if (!try_alpha_as_black_optimization())
            return false;
      }

      return true;
   }

   // Build array of unique colors and their weights.
   void dxt1_endpoint_optimizer::find_unique_colors()
   {
      m_has_transparent_pixels = false;

      uint num_opaque_pixels = 0;

      const uint alpha_thresh = m_pParams->m_pixels_have_alpha ? (m_pParams->m_dxt1a_alpha_threshold << 24U) : 0;

      const uint32* pSrc_pixels = reinterpret_cast<const uint32*>(m_pParams->m_pPixels);
      const uint32* pSrc_pixels_end = pSrc_pixels + m_pParams->m_num_pixels;

      m_unique_colors.resize(m_pParams->m_num_pixels);
      uint num_unique_colors = 0;

      m_all_pixels_grayscale = true;

      do
      {
         uint32 c = utils::read_le32(pSrc_pixels);
         pSrc_pixels++;

         if (c < alpha_thresh)
         {
            m_has_transparent_pixels = true;
            continue;
         }

         if (m_all_pixels_grayscale)
         {
            uint r = c & 0xFF;
            uint g = (c >> 8) & 0xFF;
            uint b = (c >> 16) & 0xFF;
            if ((r != g) || (r != b))
               m_all_pixels_grayscale = false;
         }

         c |= 0xFF000000U;

         unique_color_hash_map::insert_result ins_result(m_unique_color_hash_map.insert(c, num_unique_colors));

         if (ins_result.second)
         {
            utils::write_le32(&m_unique_colors[num_unique_colors].m_color.m_u32, c);
            m_unique_colors[num_unique_colors].m_weight = 1;
            num_unique_colors++;
         }
         else
            m_unique_colors[ins_result.first->second].m_weight++;

         num_opaque_pixels++;

      } while (pSrc_pixels != pSrc_pixels_end);

      m_unique_colors.resize(num_unique_colors);

      m_total_unique_color_weight = num_opaque_pixels;
   }

} // namespace crnlib