TextureDecompress: Re-add non-SSE code paths

This commit is contained in:
Stenzek 2023-12-25 14:15:37 +10:00 committed by Connor McLaughlin
parent 04d7d1a1db
commit 122f1ec767
1 changed files with 73 additions and 17 deletions

View File

@ -469,12 +469,21 @@ void DecompressBlockBC5 (uint32_t x, uint32_t y, uint32_t stride, enum BC5Mode m
// File: bc7decomp.c - Richard Geldreich, Jr. 3/31/2020 - MIT license or public domain (see end of file)
#include <string.h>
#if (defined(_M_AMD64) || defined(__x86_64__) || defined(__SSE2__))
# define BC7DECOMP_USE_SSE2
#endif
#ifdef BC7DECOMP_USE_SSE2
#include <immintrin.h>
#include <emmintrin.h>
#endif
namespace bc7decomp
{
static const __m128i g_bc7_weights4_sse2[8] =
#ifdef BC7DECOMP_USE_SSE2
const __m128i g_bc7_weights4_sse2[8] =
{
_mm_set_epi16(4, 4, 4, 4, 0, 0, 0, 0),
_mm_set_epi16(13, 13, 13, 13, 9, 9, 9, 9),
@ -485,12 +494,13 @@ namespace bc7decomp
_mm_set_epi16(55, 55, 55, 55, 51, 51, 51, 51),
_mm_set_epi16(64, 64, 64, 64, 60, 60, 60, 60),
};
#endif
static const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 };
static const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
static const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 };
const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
static const uint8_t g_bc7_partition2[64 * 16] =
const uint8_t g_bc7_partition2[64 * 16] =
{
0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
@ -502,7 +512,7 @@ static const uint8_t g_bc7_partition2[64 * 16] =
0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
};
static const uint8_t g_bc7_partition3[64 * 16] =
const uint8_t g_bc7_partition3[64 * 16] =
{
0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2, 0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1, 0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1, 0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2, 0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2, 0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1, 0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1,
0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2, 0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2, 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2, 0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2, 0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2, 0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0,
@ -514,19 +524,19 @@ static const uint8_t g_bc7_partition3[64 * 16] =
0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2, 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2, 0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2, 0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2, 0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1, 0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2, 0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2, 0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0,
};
static const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 };
const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 };
static const uint8_t g_bc7_table_anchor_index_third_subset_1[64] =
const uint8_t g_bc7_table_anchor_index_third_subset_1[64] =
{
3, 3,15,15, 8, 3,15,15, 8, 8, 6, 6, 6, 5, 3, 3, 3, 3, 8,15, 3, 3, 6,10, 5, 8, 8, 6, 8, 5,15,15, 8,15, 3, 5, 6,10, 8,15, 15, 3,15, 5,15,15,15,15, 3,15, 5, 5, 5, 8, 5,10, 5,10, 8,13,15,12, 3, 3
};
static const uint8_t g_bc7_table_anchor_index_third_subset_2[64] =
const uint8_t g_bc7_table_anchor_index_third_subset_2[64] =
{
15, 8, 8, 3,15,15, 3, 8, 15,15,15,15,15,15,15, 8, 15, 8,15, 3,15, 8,15, 8, 3,15, 6,10,15,15,10, 8, 15, 3,15,10,10, 8, 9,10, 6,15, 8,15, 3, 6, 6, 8, 15, 3,15,15,15,15,15,15, 15,15,15,15, 3,15,15, 8
};
static const uint8_t g_bc7_first_byte_to_mode[256] =
const uint8_t g_bc7_first_byte_to_mode[256] =
{
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
@ -546,7 +556,7 @@ static const uint8_t g_bc7_first_byte_to_mode[256] =
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
};
static inline void insert_weight_zero(uint64_t& index_bits, uint32_t bits_per_index, uint32_t offset)
inline void insert_weight_zero(uint64_t& index_bits, uint32_t bits_per_index, uint32_t offset)
{
uint64_t LOW_BIT_MASK = (static_cast<uint64_t>(1) << ((bits_per_index * (offset + 1)) - 1)) - 1;
uint64_t HIGH_BIT_MASK = ~LOW_BIT_MASK;
@ -577,6 +587,8 @@ static inline uint32_t bc7_interp(uint32_t l, uint32_t h, uint32_t w, uint32_t b
return 0;
}
#ifdef BC7DECOMP_USE_SSE2
static inline __m128i bc7_interp_sse2(__m128i l, __m128i h, __m128i w, __m128i iw)
{
return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(l, iw), _mm_mullo_epi16(h, w)), _mm_set1_epi16(32)), 6);
@ -619,8 +631,9 @@ static inline void bc7_interp3_sse2(const color_rgba* endpoint_pair, color_rgba*
_mm_storeu_si128(reinterpret_cast<__m128i*>(out_colors), all_colors_0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(out_colors + 4), all_colors_1);
}
#endif
static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
{
//const uint32_t SUBSETS = 3;
const uint32_t ENDPOINTS = 6;
@ -630,6 +643,9 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
const uint32_t ENDPOINT_BITS = (mode == 0) ? 4 : 5;
const uint32_t ENDPOINT_MASK = (1 << ENDPOINT_BITS) - 1;
const uint32_t PBITS = (mode == 0) ? 6 : 0;
#ifndef BC7DECOMP_USE_SSE2
const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
#endif
const uint32_t PART_BITS = (mode == 0) ? 4 : 6;
const uint32_t PART_MASK = (1 << PART_BITS) - 1;
@ -691,6 +707,7 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
color_rgba block_colors[3][8];
#ifdef BC7DECOMP_USE_SSE2
for (uint32_t s = 0; s < 3; s++)
{
if (WEIGHT_BITS == 2)
@ -698,6 +715,15 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
else
bc7_interp3_sse2(endpoints + s * 2, block_colors[s]);
}
#else
for (uint32_t s = 0; s < 3; s++)
for (uint32_t i = 0; i < WEIGHT_VALS; i++)
{
for (uint32_t c = 0; c < 3; c++)
block_colors[s][i][c] = static_cast<uint8_t>(bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS));
block_colors[s][i][3] = 255;
}
#endif
for (uint32_t i = 0; i < 16; i++)
pPixels[i] = block_colors[g_bc7_partition3[part * 16 + i]][weights[i]];
@ -705,7 +731,7 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
return true;
}
static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
{
//const uint32_t SUBSETS = 2;
const uint32_t ENDPOINTS = 4;
@ -716,6 +742,9 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
const uint32_t ENDPOINT_MASK = (1 << ENDPOINT_BITS) - 1;
const uint32_t PBITS = (mode == 1) ? 2 : 4;
const uint32_t SHARED_PBITS = (mode == 1) ? true : false;
#ifndef BC7DECOMP_USE_SSE2
const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
#endif
const uint64_t low_chunk = data_chunks[0];
const uint64_t high_chunk = data_chunks[1];
@ -783,6 +812,7 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
endpoints[e][c] = static_cast<uint8_t>((mode != 7U && c == 3U) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS));
color_rgba block_colors[2][8];
#ifdef BC7DECOMP_USE_SSE2
for (uint32_t s = 0; s < 2; s++)
{
if (WEIGHT_BITS == 2)
@ -790,6 +820,15 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
else
bc7_interp3_sse2(endpoints + s * 2, block_colors[s]);
}
#else
for (uint32_t s = 0; s < 2; s++)
for (uint32_t i = 0; i < WEIGHT_VALS; i++)
{
for (uint32_t c = 0; c < COMPS; c++)
block_colors[s][i][c] = static_cast<uint8_t>(bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS));
block_colors[s][i][3] = (COMPS == 3) ? 255 : block_colors[s][i][3];
}
#endif
for (uint32_t i = 0; i < 16; i++)
pPixels[i] = block_colors[g_bc7_partition2[part * 16 + i]][weights[i]];
@ -797,7 +836,7 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
return true;
}
static bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
{
const uint32_t ENDPOINTS = 2;
//const uint32_t COMPS = 4;
@ -879,10 +918,16 @@ static bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color
endpoints[e][c] = static_cast<uint8_t>(bc7_dequant(endpoints[e][c], (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS));
color_rgba block_colors[8];
#ifdef BC7DECOMP_USE_SSE2
if (weight_bits[0] == 3)
bc7_interp3_sse2(endpoints, block_colors);
else
bc7_interp2_sse2(endpoints, block_colors);
#else
for (uint32_t i = 0; i < (1U << weight_bits[0]); i++)
for (uint32_t c = 0; c < 3; c++)
block_colors[i][c] = static_cast<uint8_t>(bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0]));
#endif
for (uint32_t i = 0; i < (1U << weight_bits[1]); i++)
block_colors[i][3] = static_cast<uint8_t>(bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1]));
@ -898,7 +943,6 @@ static bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color
return true;
}
namespace {
struct bc7_mode_6
{
struct
@ -945,9 +989,8 @@ struct bc7_mode_6
uint64_t m_hi_bits;
};
};
}
static bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
{
static_assert(sizeof(bc7_mode_6) == 16, "sizeof(bc7_mode_6) == 16");
@ -966,6 +1009,7 @@ static bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
const uint32_t a1 = static_cast<uint32_t>((block.m_lo.m_a1 << 1) | block.m_hi.m_p1);
color_rgba vals[16];
#ifdef BC7DECOMP_USE_SSE2
__m128i vep0 = _mm_set_epi16((short)a0, (short)b0, (short)g0, (short)r0, (short)a0, (short)b0, (short)g0, (short)r0);
__m128i vep1 = _mm_set_epi16((short)a1, (short)b1, (short)g1, (short)r1, (short)a1, (short)b1, (short)g1, (short)r1);
@ -983,6 +1027,18 @@ static bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
_mm_storeu_si128(reinterpret_cast<__m128i*>(vals + i), combined);
}
#else
for (uint32_t i = 0; i < 16; i++)
{
const uint32_t w = g_bc7_weights4[i];
const uint32_t iw = 64 - w;
vals[i].set_noclamp_rgba(
(r0 * iw + r1 * w + 32) >> 6,
(g0 * iw + g1 * w + 32) >> 6,
(b0 * iw + b1 * w + 32) >> 6,
(a0 * iw + a1 * w + 32) >> 6);
}
#endif
pPixels[0] = vals[block.m_hi.m_s00];
pPixels[1] = vals[block.m_hi.m_s10];